x86, pebs: fix PEBS record size configuration
arch/x86/kernel/ds.c (blob 04e38ef646affce1b5e3472ca28efa55060bb2ed)

/*
 * Debug Store support
 *
 * This provides a low-level interface to the hardware's Debug Store
 * feature that is used for branch trace store (BTS) and
 * precise-event based sampling (PEBS).
 *
 * It manages:
 * - per-thread and per-cpu allocation of BTS and PEBS
 * - buffer memory allocation (optional)
 * - buffer overflow handling
 * - buffer access
 *
 * It assumes:
 * - get_task_struct on all parameter tasks
 * - current is allowed to trace parameter tasks
 *
 * Copyright (C) 2007-2008 Intel Corporation.
 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
 */

#include <asm/ds.h>

#include <linux/errno.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mm.h>


/*
 * The configuration for a particular DS hardware implementation.
 */
struct ds_configuration {
	/* the size of the DS structure in bytes */
	unsigned char sizeof_ds;
	/* the size of one pointer-typed field in the DS structure in
	   bytes; this covers the first 8 fields related to buffer
	   management. */
	unsigned char sizeof_field;
	/* the size of a BTS/PEBS record in bytes */
	unsigned char sizeof_rec[2];
};
static struct ds_configuration ds_cfg;

/*
 * Debug Store (DS) save area configuration (see Intel64 and IA32
 * Architectures Software Developer's Manual, section 18.5)
 *
 * The DS configuration consists of the following fields; different
 * architectures vary in the size of those fields.
 * - double-word aligned base linear address of the BTS buffer
 * - write pointer into the BTS buffer
 * - end linear address of the BTS buffer (one byte beyond the end of
 *   the buffer)
 * - interrupt pointer into BTS buffer
 *   (interrupt occurs when write pointer passes interrupt pointer)
 * - double-word aligned base linear address of the PEBS buffer
 * - write pointer into the PEBS buffer
 * - end linear address of the PEBS buffer (one byte beyond the end of
 *   the buffer)
 * - interrupt pointer into PEBS buffer
 *   (interrupt occurs when write pointer passes interrupt pointer)
 * - value to which counter is reset following counter overflow
 *
 * Later architectures use 64bit pointers throughout, whereas earlier
 * architectures use 32bit pointers in 32bit mode.
 *
 *
 * We compute the base address for the first 8 fields based on:
 * - the field size stored in the DS configuration
 * - the relative field position
 * - an offset giving the start of the respective region
 *
 * This offset is further used to index various arrays holding
 * information for BTS and PEBS at the respective index.
 *
 * On later 32bit processors, we only access the lower 32bit of the
 * 64bit pointer fields. The upper halves will be zeroed out.
 */

enum ds_field {
	ds_buffer_base = 0,
	ds_index,
	ds_absolute_maximum,
	ds_interrupt_threshold,
};

enum ds_qualifier {
	ds_bts = 0,
	ds_pebs
};

static inline unsigned long ds_get(const unsigned char *base,
				   enum ds_qualifier qual, enum ds_field field)
{
	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
	return *(unsigned long *)base;
}

static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
			  enum ds_field field, unsigned long value)
{
	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
	(*(unsigned long *)base) = value;
}
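
/*
 * Example (using the ds_cfg_64 configuration defined below, where
 * sizeof_field is 8): the PEBS write pointer is field ds_index (1) in
 * the PEBS region (qual ds_pebs, 1), so ds_get/ds_set access the
 * 8-byte word at offset 8 * (1 + 4 * 1) = 40 from the DS base.
 */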

/*
 * Locking is done only for allocating BTS or PEBS resources and for
 * guarding context and buffer memory allocation.
 *
 * Most functions require the current task to own the ds context part
 * they are going to access. All the locking is done when validating
 * access to the context.
 */
static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);

/*
 * Validate that the current task is allowed to access the BTS/PEBS
 * buffer of the parameter task.
 *
 * Returns 0, if access is granted; -Eerrno, otherwise.
 */
static inline int ds_validate_access(struct ds_context *context,
				     enum ds_qualifier qual)
{
	if (!context)
		return -EPERM;

	if (context->owner[qual] == current)
		return 0;

	return -EPERM;
}

/*
 * We either support (system-wide) per-cpu or per-thread allocation.
 * We distinguish the two based on the task_struct pointer, where a
 * NULL pointer indicates per-cpu allocation for the current cpu.
 *
 * Allocations are use-counted. As soon as resources are allocated,
 * further allocations must be of the same type (per-cpu or
 * per-thread). We model this by counting allocations (i.e. the number
 * of tracers of a certain type) for one type negatively:
 *   =0  no tracers
 *   >0  number of per-thread tracers
 *   <0  number of per-cpu tracers
 *
 * The below functions to get and put tracers and to check the
 * allocation type require the ds_lock to be held by the caller.
 *
 * The tracers count essentially gives the number of ds contexts for a
 * certain type of allocation.
 */
static long tracers;

static inline void get_tracer(struct task_struct *task)
{
	tracers += (task ? 1 : -1);
}

static inline void put_tracer(struct task_struct *task)
{
	tracers -= (task ? 1 : -1);
}

static inline int check_tracer(struct task_struct *task)
{
	return (task ? (tracers >= 0) : (tracers <= 0));
}
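
/*
 * Example: two per-thread tracers leave tracers == 2; a subsequent
 * per-cpu request (task == NULL) then fails check_tracer(), since a
 * non-positive count would be required for per-cpu allocation.
 */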

/*
 * The DS context is either attached to a thread or to a cpu:
 * - in the former case, the thread_struct contains a pointer to the
 *   attached context.
 * - in the latter case, we use a static array of per-cpu context
 *   pointers.
 *
 * Contexts are use-counted. They are allocated on first access and
 * deallocated when the last user puts the context.
 *
 * We distinguish between an allocating and a non-allocating get of a
 * context:
 * - the allocating get is used for requesting BTS/PEBS resources. It
 *   requires the caller to hold the global ds_lock.
 * - the non-allocating get is used for all other cases. A
 *   non-existing context indicates an error. It acquires and releases
 *   the ds_lock itself for obtaining the context.
 *
 * A context and its DS configuration are allocated and deallocated
 * together. A context always has a DS configuration of the
 * appropriate size.
 */
static DEFINE_PER_CPU(struct ds_context *, system_context);

#define this_system_context per_cpu(system_context, smp_processor_id())

/*
 * Returns the pointer to the parameter task's context or to the
 * system-wide context, if task is NULL.
 *
 * Increases the use count of the returned context, if not NULL.
 */
static inline struct ds_context *ds_get_context(struct task_struct *task)
{
	struct ds_context *context;

	spin_lock(&ds_lock);

	context = (task ? task->thread.ds_ctx : this_system_context);
	if (context)
		context->count++;

	spin_unlock(&ds_lock);

	return context;
}

/*
 * Same as ds_get_context, but allocates the context and its DS
 * structure, if necessary; returns NULL if out of memory.
 *
 * pre: requires ds_lock to be held
 */
static inline struct ds_context *ds_alloc_context(struct task_struct *task)
{
	struct ds_context **p_context =
		(task ? &task->thread.ds_ctx : &this_system_context);
	struct ds_context *context = *p_context;

	if (!context) {
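		/*
		 * kzalloc(GFP_KERNEL) may sleep, so drop the spinlock
		 * for the allocations and re-take it before touching
		 * *p_context again.
		 */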
		spin_unlock(&ds_lock);

		context = kzalloc(sizeof(*context), GFP_KERNEL);

		if (!context) {
			spin_lock(&ds_lock);
			return NULL;
		}

		context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
		if (!context->ds) {
			kfree(context);
			spin_lock(&ds_lock);
			return NULL;
		}

		spin_lock(&ds_lock);
		/*
		 * Check for race - another CPU could have allocated
		 * it meanwhile:
		 */
		if (*p_context) {
			kfree(context->ds);
			kfree(context);
			return *p_context;
		}

		*p_context = context;

		context->this = p_context;
		context->task = task;

		if (task)
			set_tsk_thread_flag(task, TIF_DS_AREA_MSR);

		if (!task || (task == current))
			wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);

		get_tracer(task);
	}

	context->count++;

	return context;
}

/*
 * Decreases the use count of the parameter context, if not NULL.
 * Deallocates the context, if the use count reaches zero.
 */
static inline void ds_put_context(struct ds_context *context)
{
	if (!context)
		return;

	spin_lock(&ds_lock);

	if (--context->count)
		goto out;

	*(context->this) = NULL;

	if (context->task)
		clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);

	if (!context->task || (context->task == current))
		wrmsrl(MSR_IA32_DS_AREA, 0);

	put_tracer(context->task);

	/* free any leftover buffers from tracers that did not
	 * deallocate them properly. */
	kfree(context->buffer[ds_bts]);
	kfree(context->buffer[ds_pebs]);
	kfree(context->ds);
	kfree(context);
 out:
	spin_unlock(&ds_lock);
}

/*
 * Handle a buffer overflow
 *
 * task: the task whose buffers are overflowing;
 *       NULL for a buffer overflow on the current cpu
 * context: the ds context
 * qual: the buffer type
 */
static void ds_overflow(struct task_struct *task, struct ds_context *context,
			enum ds_qualifier qual)
{
	if (!context)
		return;

	if (context->callback[qual])
		(*context->callback[qual])(task);

	/* todo: do some more overflow handling */
}

/*
 * Allocate a non-pageable buffer of the parameter size.
 * Checks the memory and the locked memory rlimit.
 *
 * Returns the buffer, if successful;
 *         NULL, if out of memory or rlimit exceeded.
 *
 * size: the requested buffer size in bytes
 * pages (out): if not NULL, contains the number of pages reserved
 */
static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
{
	unsigned long rlim, vm, pgsz;
	void *buffer;

	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
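	/* e.g. with 4 KB pages, a 5000-byte request accounts for 2 pages */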

	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
	vm = current->mm->total_vm + pgsz;
	if (rlim < vm)
		return NULL;

	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
	vm = current->mm->locked_vm + pgsz;
	if (rlim < vm)
		return NULL;

	buffer = kzalloc(size, GFP_KERNEL);
	if (!buffer)
		return NULL;

	current->mm->total_vm += pgsz;
	current->mm->locked_vm += pgsz;

	if (pages)
		*pages = pgsz;

	return buffer;
}

static int ds_request(struct task_struct *task, void *base, size_t size,
		      ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
{
	struct ds_context *context;
	unsigned long buffer, adj;
	const unsigned long alignment = (1 << 3);
	int error = 0;

	if (!ds_cfg.sizeof_ds)
		return -EOPNOTSUPP;

	/* we require some space to do alignment adjustments below */
	if (size < (alignment + ds_cfg.sizeof_rec[qual]))
		return -EINVAL;

	/* buffer overflow notification is not yet implemented */
	if (ovfl)
		return -EOPNOTSUPP;


	spin_lock(&ds_lock);

	error = -ENOMEM;
	context = ds_alloc_context(task);
	if (!context)
		goto out_unlock;

	error = -EPERM;
	if (!check_tracer(task))
		goto out_unlock;

	error = -EALREADY;
	if (context->owner[qual] == current)
		goto out_unlock;
	error = -EPERM;
	if (context->owner[qual] != NULL)
		goto out_unlock;
	context->owner[qual] = current;

	spin_unlock(&ds_lock);


	error = -ENOMEM;
	if (!base) {
		base = ds_allocate_buffer(size, &context->pages[qual]);
		if (!base)
			goto out_release;

		context->buffer[qual] = base;
	}
	error = 0;

	context->callback[qual] = ovfl;

	/* adjust the buffer address and size to meet alignment
	 * constraints:
	 * - buffer is double-word aligned
	 * - size is multiple of record size
	 *
	 * We checked the size at the very beginning; we have enough
	 * space to do the adjustment.
	 */
	buffer = (unsigned long)base;

	adj = ALIGN(buffer, alignment) - buffer;
	buffer += adj;
	size -= adj;

	size /= ds_cfg.sizeof_rec[qual];
	size *= ds_cfg.sizeof_rec[qual];

	ds_set(context->ds, qual, ds_buffer_base, buffer);
	ds_set(context->ds, qual, ds_index, buffer);
	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);

	if (ovfl) {
		/* todo: select a suitable interrupt threshold */
	} else
		ds_set(context->ds, qual,
		       ds_interrupt_threshold, buffer + size + 1);
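	/*
	 * A threshold beyond the absolute maximum is never reached by
	 * the write pointer, i.e. no overflow interrupt is generated.
	 */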

	/* we keep the context until ds_release */
	return error;

 out_release:
	context->owner[qual] = NULL;
	ds_put_context(context);
	return error;

 out_unlock:
	spin_unlock(&ds_lock);
	ds_put_context(context);
	return error;
}

int ds_request_bts(struct task_struct *task, void *base, size_t size,
		   ds_ovfl_callback_t ovfl)
{
	return ds_request(task, base, size, ovfl, ds_bts);
}

int ds_request_pebs(struct task_struct *task, void *base, size_t size,
		    ds_ovfl_callback_t ovfl)
{
	return ds_request(task, base, size, ovfl, ds_pebs);
}
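
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * request a kernel-allocated BTS buffer for a traced task, walk the
 * records collected so far, and release the buffer again. The function
 * name and the 4 KB size are made up for illustration only.
 */
#if 0
static void example_dump_bts(struct task_struct *task)
{
	const void *rec;
	size_t i, count;

	/* base == NULL: let the DS code allocate and account the buffer */
	if (ds_request_bts(task, NULL, 4096, /* ovfl = */ NULL) < 0)
		return;

	/* number of records written so far */
	if (ds_get_bts_index(task, &count) < 0)
		goto out;

	for (i = 0; i < count; i++) {
		if (ds_access_bts(task, i, &rec) < 0)
			break;
		/* rec points at ds_cfg.sizeof_rec[ds_bts] raw bytes */
	}

 out:
	ds_release_bts(task);
}
#endif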

static int ds_release(struct task_struct *task, enum ds_qualifier qual)
{
	struct ds_context *context;
	int error;

	context = ds_get_context(task);
	error = ds_validate_access(context, qual);
	if (error < 0)
		goto out;

	kfree(context->buffer[qual]);
	context->buffer[qual] = NULL;

	current->mm->total_vm -= context->pages[qual];
	current->mm->locked_vm -= context->pages[qual];
	context->pages[qual] = 0;
	context->owner[qual] = NULL;

	/*
	 * we put the context twice:
	 * once for the ds_get_context
	 * once for the corresponding ds_request
	 */
	ds_put_context(context);
 out:
	ds_put_context(context);
	return error;
}

int ds_release_bts(struct task_struct *task)
{
	return ds_release(task, ds_bts);
}

int ds_release_pebs(struct task_struct *task)
{
	return ds_release(task, ds_pebs);
}

static int ds_get_index(struct task_struct *task, size_t *pos,
			enum ds_qualifier qual)
{
	struct ds_context *context;
	unsigned long base, index;
	int error;

	context = ds_get_context(task);
	error = ds_validate_access(context, qual);
	if (error < 0)
		goto out;

	base = ds_get(context->ds, qual, ds_buffer_base);
	index = ds_get(context->ds, qual, ds_index);
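
	/* report the write pointer as the index of the next record */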
	error = ((index - base) / ds_cfg.sizeof_rec[qual]);
	if (pos)
		*pos = error;
 out:
	ds_put_context(context);
	return error;
}

int ds_get_bts_index(struct task_struct *task, size_t *pos)
{
	return ds_get_index(task, pos, ds_bts);
}

int ds_get_pebs_index(struct task_struct *task, size_t *pos)
{
	return ds_get_index(task, pos, ds_pebs);
}

static int ds_get_end(struct task_struct *task, size_t *pos,
		      enum ds_qualifier qual)
{
	struct ds_context *context;
	unsigned long base, end;
	int error;

	context = ds_get_context(task);
	error = ds_validate_access(context, qual);
	if (error < 0)
		goto out;

	base = ds_get(context->ds, qual, ds_buffer_base);
	end = ds_get(context->ds, qual, ds_absolute_maximum);

	error = ((end - base) / ds_cfg.sizeof_rec[qual]);
	if (pos)
		*pos = error;
 out:
	ds_put_context(context);
	return error;
}

int ds_get_bts_end(struct task_struct *task, size_t *pos)
{
	return ds_get_end(task, pos, ds_bts);
}

int ds_get_pebs_end(struct task_struct *task, size_t *pos)
{
	return ds_get_end(task, pos, ds_pebs);
}

static int ds_access(struct task_struct *task, size_t index,
		     const void **record, enum ds_qualifier qual)
{
	struct ds_context *context;
	unsigned long base, idx;
	int error;

	if (!record)
		return -EINVAL;

	context = ds_get_context(task);
	error = ds_validate_access(context, qual);
	if (error < 0)
		goto out;

	base = ds_get(context->ds, qual, ds_buffer_base);
	idx = base + (index * ds_cfg.sizeof_rec[qual]);

	error = -EINVAL;
	if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
		goto out;

	*record = (const void *)idx;
	error = ds_cfg.sizeof_rec[qual];
 out:
	ds_put_context(context);
	return error;
}

int ds_access_bts(struct task_struct *task, size_t index, const void **record)
{
	return ds_access(task, index, record, ds_bts);
}

int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
{
	return ds_access(task, index, record, ds_pebs);
}

static int ds_write(struct task_struct *task, const void *record, size_t size,
		    enum ds_qualifier qual, int force)
{
	struct ds_context *context;
	int error;

	if (!record)
		return -EINVAL;

	error = -EPERM;
	context = ds_get_context(task);
	if (!context)
		goto out;

	if (!force) {
		error = ds_validate_access(context, qual);
		if (error < 0)
			goto out;
	}

	error = 0;
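	/* on success, the return value accumulates the bytes written */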
	while (size) {
		unsigned long base, index, end, write_end, int_th;
		unsigned long write_size, adj_write_size;

		/*
		 * write as much as possible without producing an
		 * overflow interrupt.
		 *
		 * interrupt_threshold must either be
		 * - bigger than absolute_maximum or
		 * - point to a record between buffer_base and absolute_maximum
		 *
		 * index points to a valid record.
		 */
		base = ds_get(context->ds, qual, ds_buffer_base);
		index = ds_get(context->ds, qual, ds_index);
		end = ds_get(context->ds, qual, ds_absolute_maximum);
		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);

		write_end = min(end, int_th);

		/* if we are already beyond the interrupt threshold,
		 * we fill the entire buffer */
		if (write_end <= index)
			write_end = end;

		if (write_end <= index)
			goto out;

		write_size = min((unsigned long) size, write_end - index);
		memcpy((void *)index, record, write_size);

		record = (const char *)record + write_size;
		size -= write_size;
		error += write_size;

		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
		adj_write_size *= ds_cfg.sizeof_rec[qual];

		/* zero out trailing bytes */
		memset((char *)index + write_size, 0,
		       adj_write_size - write_size);
		index += adj_write_size;

		if (index >= end)
			index = base;
		ds_set(context->ds, qual, ds_index, index);

		if (index >= int_th)
			ds_overflow(task, context, qual);
	}

 out:
	ds_put_context(context);
	return error;
}

int ds_write_bts(struct task_struct *task, const void *record, size_t size)
{
	return ds_write(task, record, size, ds_bts, /* force = */ 0);
}

int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
{
	return ds_write(task, record, size, ds_pebs, /* force = */ 0);
}

int ds_unchecked_write_bts(struct task_struct *task,
			   const void *record, size_t size)
{
	return ds_write(task, record, size, ds_bts, /* force = */ 1);
}

int ds_unchecked_write_pebs(struct task_struct *task,
			    const void *record, size_t size)
{
	return ds_write(task, record, size, ds_pebs, /* force = */ 1);
}

static int ds_reset_or_clear(struct task_struct *task,
			     enum ds_qualifier qual, int clear)
{
	struct ds_context *context;
	unsigned long base, end;
	int error;

	context = ds_get_context(task);
	error = ds_validate_access(context, qual);
	if (error < 0)
		goto out;

	base = ds_get(context->ds, qual, ds_buffer_base);
	end = ds_get(context->ds, qual, ds_absolute_maximum);

	if (clear)
		memset((void *)base, 0, end - base);

	ds_set(context->ds, qual, ds_index, base);

	error = 0;
 out:
	ds_put_context(context);
	return error;
}

int ds_reset_bts(struct task_struct *task)
{
	return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
}

int ds_reset_pebs(struct task_struct *task)
{
	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
}

int ds_clear_bts(struct task_struct *task)
{
	return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
}

int ds_clear_pebs(struct task_struct *task)
{
	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
}
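
/*
 * The PEBS counter reset value is the 9th field in the DS save area,
 * i.e. it sits at offset 8 * sizeof_field, right after the eight
 * buffer-management pointers described above.
 */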
int ds_get_pebs_reset(struct task_struct *task, u64 *value)
{
	struct ds_context *context;
	int error;

	if (!value)
		return -EINVAL;

	context = ds_get_context(task);
	error = ds_validate_access(context, ds_pebs);
	if (error < 0)
		goto out;

	*value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));

	error = 0;
 out:
	ds_put_context(context);
	return error;
}

int ds_set_pebs_reset(struct task_struct *task, u64 value)
{
	struct ds_context *context;
	int error;

	context = ds_get_context(task);
	error = ds_validate_access(context, ds_pebs);
	if (error < 0)
		goto out;

	*(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;

	error = 0;
 out:
	ds_put_context(context);
	return error;
}

static const struct ds_configuration ds_cfg_var = {
	.sizeof_ds    = sizeof(long) * 12,
	.sizeof_field = sizeof(long),
	.sizeof_rec[ds_bts]  = sizeof(long) * 3,
#ifdef __i386__
	.sizeof_rec[ds_pebs] = sizeof(long) * 10
#else
	.sizeof_rec[ds_pebs] = sizeof(long) * 18
#endif
};
static const struct ds_configuration ds_cfg_64 = {
	.sizeof_ds    = 8 * 12,
	.sizeof_field = 8,
	.sizeof_rec[ds_bts]  = 8 * 3,
#ifdef __i386__
	.sizeof_rec[ds_pebs] = 8 * 10
#else
	.sizeof_rec[ds_pebs] = 8 * 18
#endif
};
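
/*
 * With ds_cfg_64 on a 64-bit build this works out to a 96-byte DS
 * save area, 24-byte BTS records, and 144-byte PEBS records; on an
 * __i386__ build the PEBS record is 80 bytes instead.
 */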

static inline void
ds_configure(const struct ds_configuration *cfg)
{
	ds_cfg = *cfg;
}

void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
{
	switch (c->x86) {
	case 0x6:
		switch (c->x86_model) {
		case 0xD:
		case 0xE: /* Pentium M */
			ds_configure(&ds_cfg_var);
			break;
		case 0xF: /* Core2 */
		case 0x1C: /* Atom */
			ds_configure(&ds_cfg_64);
			break;
		default:
			/* sorry, don't know about them */
			break;
		}
		break;
	case 0xF:
		switch (c->x86_model) {
		case 0x0:
		case 0x1:
		case 0x2: /* Netburst */
			ds_configure(&ds_cfg_var);
			break;
		default:
			/* sorry, don't know about them */
			break;
		}
		break;
	default:
		/* sorry, don't know about them */
		break;
	}
}

void ds_free(struct ds_context *context)
{
	/* This is called when the task owning the parameter context
	 * is dying. There should not be any user of that context left
	 * to disturb us anymore. */
	unsigned long leftovers = context->count;
	while (leftovers--)
		ds_put_context(context);
}