drivers/oprofile/cpu_buffer.c

   1 /**
   2  * @file cpu_buffer.c
   3  *
   4  * @remark Copyright 2002-2009 OProfile authors
   5  * @remark Read the file COPYING
   6  *
   7  * @author John Levon <levon@movementarian.org>
   8  * @author Barry Kasindorf <barry.kasindorf@amd.com>
   9  * @author Robert Richter <robert.richter@amd.com>
  10  *
  11  * Each CPU has a local buffer that stores PC value/event
  12  * pairs. We also log context switches when we notice them.
  13  * Eventually each CPU's buffer is processed into the global
  14  * event buffer by sync_buffer().
  15  *
  16  * We use a local buffer for two reasons: an NMI or similar
  17  * interrupt cannot synchronise, and high sampling rates
  18  * would lead to catastrophic global synchronisation if
  19  * a global buffer was used.
  20  */
  21
  22 #include <linux/sched.h>
  23 #include <linux/oprofile.h>
  24 #include <linux/errno.h>
  25
  26 #include "event_buffer.h"
  27 #include "cpu_buffer.h"
  28 #include "buffer_sync.h"
  29 #include "oprof.h"
  30
  31 #define OP_BUFFER_FLAGS 0
  32
/*
 * Read and write access is using spin locking. Thus, writing to the
 * buffer by NMI handler (x86) could occur also during critical
 * sections when reading the buffer. To avoid this, there are 2
 * buffers for independent read and write access. Read access is in
 * process context only, write access only in the NMI handler. If the
 * read buffer runs empty, both buffers are swapped atomically. There
 * is potentially a small window during swapping where the buffers are
 * disabled and samples could be lost.
 *
 * Using 2 buffers adds a little overhead, but the solution is clear
 * and does not require changes in the ring buffer implementation. It
 * can be changed to a single buffer solution when the ring buffer
 * access is implemented as non-locking atomic code.
 */
/* buffer drained by op_cpu_buffer_read_entry() */
static struct ring_buffer *op_ring_buffer_read;
/* buffer filled through op_cpu_buffer_write_reserve()/_write_commit() */
static struct ring_buffer *op_ring_buffer_write;
/* per-cpu bookkeeping: counters, last seen context and the sync work */
DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer);

static void wq_sync_buffer(struct work_struct *work);

/* delay, in jiffies, between two runs of a cpu's sync work */
#define DEFAULT_TIMER_EXPIRE (HZ / 10)
/* non-zero while wq_sync_buffer() is allowed to re-arm itself */
static int work_enabled;
  56
  57 unsigned long oprofile_get_cpu_buffer_size(void)
  58 {
  59         return oprofile_cpu_buffer_size;
  60 }
  61
  62 void oprofile_cpu_buffer_inc_smpl_lost(void)
  63 {
  64         struct oprofile_cpu_buffer *cpu_buf
  65                 = &__get_cpu_var(cpu_buffer);
  66
  67         cpu_buf->sample_lost_overflow++;
  68 }
  69
  70 void free_cpu_buffers(void)
  71 {
  72         if (op_ring_buffer_read)
  73                 ring_buffer_free(op_ring_buffer_read);
  74         op_ring_buffer_read = NULL;
  75         if (op_ring_buffer_write)
  76                 ring_buffer_free(op_ring_buffer_write);
  77         op_ring_buffer_write = NULL;
  78 }
  79
  80 #define RB_EVENT_HDR_SIZE 4
  81
  82 int alloc_cpu_buffers(void)
  83 {
  84         int i;
  85
  86         unsigned long buffer_size = oprofile_cpu_buffer_size;
  87         unsigned long byte_size = buffer_size * (sizeof(struct op_sample) +
  88                                                  RB_EVENT_HDR_SIZE);
  89
  90         op_ring_buffer_read = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
  91         if (!op_ring_buffer_read)
  92                 goto fail;
  93         op_ring_buffer_write = ring_buffer_alloc(byte_size, OP_BUFFER_FLAGS);
  94         if (!op_ring_buffer_write)
  95                 goto fail;
  96
  97         for_each_possible_cpu(i) {
  98                 struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);
  99
 100                 b->last_task = NULL;
 101                 b->last_is_kernel = -1;
 102                 b->tracing = 0;
 103                 b->buffer_size = buffer_size;
 104                 b->sample_received = 0;
 105                 b->sample_lost_overflow = 0;
 106                 b->backtrace_aborted = 0;
 107                 b->sample_invalid_eip = 0;
 108                 b->cpu = i;
 109                 INIT_DELAYED_WORK(&b->work, wq_sync_buffer);
 110         }
 111         return 0;
 112
 113 fail:
 114         free_cpu_buffers();
 115         return -ENOMEM;
 116 }
 117
 118 void start_cpu_work(void)
 119 {
 120         int i;
 121
 122         work_enabled = 1;
 123
 124         for_each_online_cpu(i) {
 125                 struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);
 126
 127                 /*
 128                  * Spread the work by 1 jiffy per cpu so they dont all
 129                  * fire at once.
 130                  */
 131                 schedule_delayed_work_on(i, &b->work, DEFAULT_TIMER_EXPIRE + i);
 132         }
 133 }
 134
 135 void end_cpu_work(void)
 136 {
 137         int i;
 138
 139         work_enabled = 0;
 140
 141         for_each_online_cpu(i) {
 142                 struct oprofile_cpu_buffer *b = &per_cpu(cpu_buffer, i);
 143
 144                 cancel_delayed_work(&b->work);
 145         }
 146
 147         flush_scheduled_work();
 148 }
 149
 150 /*
 151  * This function prepares the cpu buffer to write a sample.
 152  *
 153  * Struct op_entry is used during operations on the ring buffer while
 154  * struct op_sample contains the data that is stored in the ring
 155  * buffer. Struct entry can be uninitialized. The function reserves a
 156  * data array that is specified by size. Use
 157  * op_cpu_buffer_write_commit() after preparing the sample. In case of
 158  * errors a null pointer is returned, otherwise the pointer to the
 159  * sample.
 160  *
 161  */
 162 struct op_sample
 163 *op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size)
 164 {
 165         entry->event = ring_buffer_lock_reserve
 166                 (op_ring_buffer_write, sizeof(struct op_sample) +
 167                  size * sizeof(entry->sample->data[0]));
 168         if (entry->event)
 169                 entry->sample = ring_buffer_event_data(entry->event);
 170         else
 171                 entry->sample = NULL;
 172
 173         if (!entry->sample)
 174                 return NULL;
 175
 176         entry->size = size;
 177         entry->data = entry->sample->data;
 178
 179         return entry->sample;
 180 }
 181
 182 int op_cpu_buffer_write_commit(struct op_entry *entry)
 183 {
 184         return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event);
 185 }
 186
 187 struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu)
 188 {
 189         struct ring_buffer_event *e;
 190         e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
 191         if (e)
 192                 goto event;
 193         if (ring_buffer_swap_cpu(op_ring_buffer_read,
 194                                  op_ring_buffer_write,
 195                                  cpu))
 196                 return NULL;
 197         e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL);
 198         if (e)
 199                 goto event;
 200         return NULL;
 201
 202 event:
 203         entry->event = e;
 204         entry->sample = ring_buffer_event_data(e);
 205         entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample))
 206                 / sizeof(entry->sample->data[0]);
 207         entry->data = entry->sample->data;
 208         return entry->sample;
 209 }
 210
 211 unsigned long op_cpu_buffer_entries(int cpu)
 212 {
 213         return ring_buffer_entries_cpu(op_ring_buffer_read, cpu)
 214                 + ring_buffer_entries_cpu(op_ring_buffer_write, cpu);
 215 }
 216
 217 static int
 218 op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace,
 219             int is_kernel, struct task_struct *task)
 220 {
 221         struct op_entry entry;
 222         struct op_sample *sample;
 223         unsigned long flags;
 224         int size;
 225
 226         flags = 0;
 227
 228         if (backtrace)
 229                 flags |= TRACE_BEGIN;
 230
 231         /* notice a switch from user->kernel or vice versa */
 232         is_kernel = !!is_kernel;
 233         if (cpu_buf->last_is_kernel != is_kernel) {
 234                 cpu_buf->last_is_kernel = is_kernel;
 235                 flags |= KERNEL_CTX_SWITCH;
 236                 if (is_kernel)
 237                         flags |= IS_KERNEL;
 238         }
 239
 240         /* notice a task switch */
 241         if (cpu_buf->last_task != task) {
 242                 cpu_buf->last_task = task;
 243                 flags |= USER_CTX_SWITCH;
 244         }
 245
 246         if (!flags)
 247                 /* nothing to do */
 248                 return 0;
 249
 250         if (flags & USER_CTX_SWITCH)
 251                 size = 1;
 252         else
 253                 size = 0;
 254
 255         sample = op_cpu_buffer_write_reserve(&entry, size);
 256         if (!sample)
 257                 return -ENOMEM;
 258
 259         sample->eip = ESCAPE_CODE;
 260         sample->event = flags;
 261
 262         if (size)
 263                 op_cpu_buffer_add_data(&entry, (unsigned long)task);
 264
 265         op_cpu_buffer_write_commit(&entry);
 266
 267         return 0;
 268 }
 269
 270 static inline int
 271 op_add_sample(struct oprofile_cpu_buffer *cpu_buf,
 272               unsigned long pc, unsigned long event)
 273 {
 274         struct op_entry entry;
 275         struct op_sample *sample;
 276
 277         sample = op_cpu_buffer_write_reserve(&entry, 0);
 278         if (!sample)
 279                 return -ENOMEM;
 280
 281         sample->eip = pc;
 282         sample->event = event;
 283
 284         return op_cpu_buffer_write_commit(&entry);
 285 }
 286
 287 /*
 288  * This must be safe from any context.
 289  *
 290  * is_kernel is needed because on some architectures you cannot
 291  * tell if you are in kernel or user space simply by looking at
 292  * pc. We tag this in the buffer by generating kernel enter/exit
 293  * events whenever is_kernel changes
 294  */
 295 static int
 296 log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc,
 297            unsigned long backtrace, int is_kernel, unsigned long event)
 298 {
 299         cpu_buf->sample_received++;
 300
 301         if (pc == ESCAPE_CODE) {
 302                 cpu_buf->sample_invalid_eip++;
 303                 return 0;
 304         }
 305
 306         if (op_add_code(cpu_buf, backtrace, is_kernel, current))
 307                 goto fail;
 308
 309         if (op_add_sample(cpu_buf, pc, event))
 310                 goto fail;
 311
 312         return 1;
 313
 314 fail:
 315         cpu_buf->sample_lost_overflow++;
 316         return 0;
 317 }
 318
/*
 * Mark a backtrace as in progress on this cpu buffer;
 * oprofile_add_trace() only records entries while the flag is set.
 */
static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 1;
}

/* Mark the backtrace on this cpu buffer as finished. */
static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf)
{
        cpu_buf->tracing = 0;
}
 328
 329 static inline void
 330 __oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
 331                           unsigned long event, int is_kernel)
 332 {
 333         struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
 334         unsigned long backtrace = oprofile_backtrace_depth;
 335
 336         /*
 337          * if log_sample() fail we can't backtrace since we lost the
 338          * source of this event
 339          */
 340         if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event))
 341                 /* failed */
 342                 return;
 343
 344         if (!backtrace)
 345                 return;
 346
 347         oprofile_begin_trace(cpu_buf);
 348         oprofile_ops.backtrace(regs, backtrace);
 349         oprofile_end_trace(cpu_buf);
 350 }
 351
/*
 * Log a sample for which the caller supplies pc and is_kernel
 * itself; forwards unchanged to __oprofile_add_ext_sample().
 */
void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs,
                             unsigned long event, int is_kernel)
{
        __oprofile_add_ext_sample(pc, regs, event, is_kernel);
}
 357
 358 void oprofile_add_sample(struct pt_regs * const regs, unsigned long event)
 359 {
 360         int is_kernel = !user_mode(regs);
 361         unsigned long pc = profile_pc(regs);
 362
 363         __oprofile_add_ext_sample(pc, regs, event, is_kernel);
 364 }
 365
 366 /*
 367  * Add samples with data to the ring buffer.
 368  *
 369  * Use oprofile_add_data(&entry, val) to add data and
 370  * oprofile_write_commit(&entry) to commit the sample.
 371  */
 372 void
 373 oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs,
 374                        unsigned long pc, int code, int size)
 375 {
 376         struct op_sample *sample;
 377         int is_kernel = !user_mode(regs);
 378         struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
 379
 380         cpu_buf->sample_received++;
 381
 382         /* no backtraces for samples with data */
 383         if (op_add_code(cpu_buf, 0, is_kernel, current))
 384                 goto fail;
 385
 386         sample = op_cpu_buffer_write_reserve(entry, size + 2);
 387         if (!sample)
 388                 goto fail;
 389         sample->eip = ESCAPE_CODE;
 390         sample->event = 0;              /* no flags */
 391
 392         op_cpu_buffer_add_data(entry, code);
 393         op_cpu_buffer_add_data(entry, pc);
 394
 395         return;
 396
 397 fail:
 398         entry->event = NULL;
 399         cpu_buf->sample_lost_overflow++;
 400 }
 401
 402 int oprofile_add_data(struct op_entry *entry, unsigned long val)
 403 {
 404         if (!entry->event)
 405                 return 0;
 406         return op_cpu_buffer_add_data(entry, val);
 407 }
 408
 409 int oprofile_add_data64(struct op_entry *entry, u64 val)
 410 {
 411         if (!entry->event)
 412                 return 0;
 413         if (op_cpu_buffer_get_size(entry) < 2)
 414                 /*
 415                  * the function returns 0 to indicate a too small
 416                  * buffer, even if there is some space left
 417                  */
 418                 return 0;
 419         if (!op_cpu_buffer_add_data(entry, (u32)val))
 420                 return 0;
 421         return op_cpu_buffer_add_data(entry, (u32)(val >> 32));
 422 }
 423
 424 int oprofile_write_commit(struct op_entry *entry)
 425 {
 426         if (!entry->event)
 427                 return -EINVAL;
 428         return op_cpu_buffer_write_commit(entry);
 429 }
 430
 431 void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event)
 432 {
 433         struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
 434         log_sample(cpu_buf, pc, 0, is_kernel, event);
 435 }
 436
 437 void oprofile_add_trace(unsigned long pc)
 438 {
 439         struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer);
 440
 441         if (!cpu_buf->tracing)
 442                 return;
 443
 444         /*
 445          * broken frame can give an eip with the same value as an
 446          * escape code, abort the trace if we get it
 447          */
 448         if (pc == ESCAPE_CODE)
 449                 goto fail;
 450
 451         if (op_add_sample(cpu_buf, pc, 0))
 452                 goto fail;
 453
 454         return;
 455 fail:
 456         cpu_buf->tracing = 0;
 457         cpu_buf->backtrace_aborted++;
 458         return;
 459 }
 460
 461 /*
 462  * This serves to avoid cpu buffer overflow, and makes sure
 463  * the task mortuary progresses
 464  *
 465  * By using schedule_delayed_work_on and then schedule_delayed_work
 466  * we guarantee this will stay on the correct cpu
 467  */
 468 static void wq_sync_buffer(struct work_struct *work)
 469 {
 470         struct oprofile_cpu_buffer *b =
 471                 container_of(work, struct oprofile_cpu_buffer, work.work);
 472         if (b->cpu != smp_processor_id()) {
 473                 printk(KERN_DEBUG "WQ on CPU%d, prefer CPU%d\n",
 474                        smp_processor_id(), b->cpu);
 475
 476                 if (!cpu_online(b->cpu)) {
 477                         cancel_delayed_work(&b->work);
 478                         return;
 479                 }
 480         }
 481         sync_buffer(b->cpu);
 482
 483         /* don't re-add the work if we're shutting down */
 484         if (work_enabled)
 485                 schedule_delayed_work(&b->work, DEFAULT_TIMER_EXPIRE);
 486 }