2 * pstat - Linux performance counter subsystem uspace or kthread tracing
4 * Borrowed some code from libperf, which has been written by
5 * Copyright 2010 Wolfgang Richter <wolf@cs.cmu.edu>
6 * Copyright 2010 Ekaterina Taralova <etaralova@cs.cmu.edu>
7 * Copyright 2010 Karl Naden <kbn@cs.cmu.edu>
10 * Performance events, data type definitions, declarations by
11 * Copyright 2008-2009 Thomas Gleixner <tglx@linutronix.de>
12 * Copyright 2008-2009 Ingo Molnar <mingo@redhat.com>
13 * Copyright 2008-2009 Peter Zijlstra <pzijlstr@redhat.com>
14 * Copyright 2009 Paul Mackerras <paulus@au1.ibm.com>
15 * Subject to the GPL / see COPYING.
17 * pstat has been written by
18 * Copyright 2011 Daniel Borkmann <dborkma@tik.ee.ethz.ch>
19 * Swiss federal institute of technology (ETH Zurich)
22 * Needs Linux kernel >= 2.6.32. For more detailed information have a look at
23 * tools/perf/design.txt and http://lkml.org/lkml/2009/6/6/149. Tested on
24 * x86_64. Larger comments refer to tools/perf/design.txt. Be warned, the stuff
25 * from design.txt, especially data structures are heavily deprecated!
27 * Compile: gcc pstat.c -o pstat -lrt -O2
28 * Patches are welcome! Mail them to <dborkma@tik.ee.ethz.ch>.
29 * - Additions made by Emmanuel Roullit <emmanuel@netsniff-ng.org>
32 * - Tracing another already running pid not yet working! CPU goes up
33 * to 100% and the program never returns.
34 * - Tracing a single event returns in strange numbers! May be because
35 * of group leader settings?
53 #include <sys/syscall.h>
54 #include <sys/types.h>
57 #include <sys/ptrace.h>
58 #include <asm/byteorder.h>
59 #include <linux/types.h>
60 #include <linux/ioctl.h>
66 PERF_TYPE_HARDWARE
= 0,
67 PERF_TYPE_SOFTWARE
= 1,
68 PERF_TYPE_TRACEPOINT
= 2,
69 PERF_TYPE_HW_CACHE
= 3,
71 PERF_TYPE_BREAKPOINT
= 5,
72 PERF_TYPE_MAX
, /* non-ABI */
76 * Generalized performance event event_id types, used by the
77 * attr.event_id parameter of the sys_perf_event_open() syscall:
80 PERF_COUNT_HW_CPU_CYCLES
= 0,
81 PERF_COUNT_HW_INSTRUCTIONS
= 1,
82 PERF_COUNT_HW_CACHE_REFERENCES
= 2,
83 PERF_COUNT_HW_CACHE_MISSES
= 3,
84 PERF_COUNT_HW_BRANCH_INSTRUCTIONS
= 4,
85 PERF_COUNT_HW_BRANCH_MISSES
= 5,
86 PERF_COUNT_HW_BUS_CYCLES
= 6,
87 PERF_COUNT_HW_MAX
, /* non-ABI */
91 * Generalized hardware cache events:
92 * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
93 * { read, write, prefetch } x
94 * { accesses, misses }
96 enum perf_hw_cache_id
{
97 PERF_COUNT_HW_CACHE_L1D
= 0,
98 PERF_COUNT_HW_CACHE_L1I
= 1,
99 PERF_COUNT_HW_CACHE_LL
= 2,
100 PERF_COUNT_HW_CACHE_DTLB
= 3,
101 PERF_COUNT_HW_CACHE_ITLB
= 4,
102 PERF_COUNT_HW_CACHE_BPU
= 5,
103 PERF_COUNT_HW_CACHE_MAX
, /* non-ABI */
106 enum perf_hw_cache_op_id
{
107 PERF_COUNT_HW_CACHE_OP_READ
= 0,
108 PERF_COUNT_HW_CACHE_OP_WRITE
= 1,
109 PERF_COUNT_HW_CACHE_OP_PREFETCH
= 2,
110 PERF_COUNT_HW_CACHE_OP_MAX
, /* non-ABI */
113 enum perf_hw_cache_op_result_id
{
114 PERF_COUNT_HW_CACHE_RESULT_ACCESS
= 0,
115 PERF_COUNT_HW_CACHE_RESULT_MISS
= 1,
116 PERF_COUNT_HW_CACHE_RESULT_MAX
, /* non-ABI */
120 * Special "software" events provided by the kernel, even if the hardware
121 * does not support performance events. These events measure various
122 * physical and sw events of the kernel (and allow the profiling of them as
126 PERF_COUNT_SW_CPU_CLOCK
= 0,
127 PERF_COUNT_SW_TASK_CLOCK
= 1,
128 PERF_COUNT_SW_PAGE_FAULTS
= 2,
129 PERF_COUNT_SW_CONTEXT_SWITCHES
= 3,
130 PERF_COUNT_SW_CPU_MIGRATIONS
= 4,
131 PERF_COUNT_SW_PAGE_FAULTS_MIN
= 5,
132 PERF_COUNT_SW_PAGE_FAULTS_MAJ
= 6,
133 PERF_COUNT_SW_ALIGNMENT_FAULTS
= 7,
134 PERF_COUNT_SW_EMULATION_FAULTS
= 8,
135 PERF_COUNT_SW_MAX
, /* non-ABI */
139 * Hardware event_id to monitor via a performance monitoring event:
141 * The 'disabled' bit specifies whether the counter starts out disabled
142 * or enabled. If it is initially disabled, it can be enabled by ioctl
145 * The 'inherit' bit, if set, specifies that this counter should count
146 * events on descendant tasks as well as the task specified. This only
147 * applies to new descendants, not to any existing descendants at the
148 * time the counter is created (nor to any new descendants of existing
151 * The 'pinned' bit, if set, specifies that the counter should always be
152 * on the CPU if at all possible. It only applies to hardware counters
153 * and only to group leaders. If a pinned counter cannot be put onto the
154 * CPU (e.g. because there are not enough hardware counters or because of
155 * a conflict with some other event), then the counter goes into an
156 * 'error' state, where reads return end-of-file (i.e. read() returns 0)
157 * until the counter is subsequently enabled or disabled.
159 * The 'exclusive' bit, if set, specifies that when this counter's group
160 * is on the CPU, it should be the only group using the CPU's counters.
161 * In future, this will allow sophisticated monitoring programs to supply
162 * extra configuration information via 'extra_config_len' to exploit
163 * advanced features of the CPU's Performance Monitor Unit (PMU) that are
164 * not otherwise accessible and that might disrupt other hardware
167 * The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
168 * way to request that counting of events be restricted to times when the
169 * CPU is in user, kernel and/or hypervisor mode.
171 struct perf_event_attr
{
173 * Major type: hardware/software/tracepoint/etc.
177 * Size of the attr structure, for fwd/bwd compat.
181 * Type specific configuration information.
190 __u64 disabled
:1, /* off by default */
191 inherit
:1, /* children inherit it */
192 pinned
:1, /* must always be on PMU */
193 exclusive
:1, /* only group on PMU */
194 exclude_user
:1, /* don't count user */
195 exclude_kernel
:1, /* ditto kernel */
196 exclude_hv
:1, /* ditto hypervisor */
197 exclude_idle
:1, /* don't count when idle */
198 mmap
:1, /* include mmap data */
199 comm
:1, /* include comm data */
200 freq
:1, /* use freq, not period */
201 inherit_stat
:1, /* per task counts */
202 enable_on_exec
:1, /* next exec enables */
203 task
:1, /* trace fork/exit */
204 watermark
:1, /* wakeup_watermark */
205 precise_ip
:2, /* skid constraint */
206 mmap_data
:1, /* non-exec mmap data */
209 __u32 wakeup_events
; /* wakeup every n events */
210 __u32 wakeup_watermark
; /* bytes before wakeup */
217 enum perf_event_ioc_flags
{
218 PERF_IOC_FLAG_GROUP
= 1U << 0,
222 * Ioctls that can be done on a perf event fd:
/* Ioctls that can be issued on a perf event fd. */
#define PERF_EVENT_IOC_ENABLE	_IO ('$', 0)
#define PERF_EVENT_IOC_DISABLE	_IO ('$', 1)
#define PERF_EVENT_IOC_REFRESH	_IO ('$', 2)

/* Number of elements in a true array (not valid on pointers). */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))

/* Size of the per-process counter fd table (struct perf_data.fds). */
#define MAX_COUNTERS 32

/* Sentinels: counter fd not opened yet / no group leader chosen. */
#define FDS_INVALID -1
#define GRP_INVALID -1

/* Counting-mode flag bit: count events spent in kernel mode. */
#define MODE_KERNEL 1

/* Branch-prediction hints for hot paths. */
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

/* Abort hard on internal inconsistencies. */
#define bug() __builtin_trap()

#define PROGNAME "pstat"
#define VERSNAME "0.9"
256 /* Software tracepoints */
257 COUNT_SW_CPU_CLOCK
= 0,
258 COUNT_SW_TASK_CLOCK
= 1,
259 COUNT_SW_CONTEXT_SWITCHES
= 2,
260 COUNT_SW_CPU_MIGRATIONS
= 3,
261 COUNT_SW_PAGE_FAULTS
= 4,
262 COUNT_SW_PAGE_FAULTS_MIN
= 5,
263 COUNT_SW_PAGE_FAULTS_MAJ
= 6,
264 /* Hardware counters */
265 COUNT_HW_CPU_CYCLES
= 7,
266 COUNT_HW_INSTRUCTIONS
= 8,
267 COUNT_HW_CACHE_REFERENCES
= 9,
268 COUNT_HW_CACHE_MISSES
= 10,
269 COUNT_HW_BRANCH_INSTRUCTIONS
= 11,
270 COUNT_HW_BRANCH_MISSES
= 12,
271 COUNT_HW_BUS_CYCLES
= 13,
273 /* L1D - data cache */
274 COUNT_HW_CACHE_L1D_LOADS
= 14,
275 COUNT_HW_CACHE_L1D_LOADS_MISSES
= 15,
276 COUNT_HW_CACHE_L1D_STORES
= 16,
277 COUNT_HW_CACHE_L1D_STORES_MISSES
= 17,
278 COUNT_HW_CACHE_L1D_PREFETCHES
= 18,
279 /* L1I - Instruction cache */
280 COUNT_HW_CACHE_L1I_LOADS
= 19,
281 COUNT_HW_CACHE_L1I_LOADS_MISSES
= 20,
282 /* LL - Last level cache */
283 COUNT_HW_CACHE_LL_LOADS
= 21,
284 COUNT_HW_CACHE_LL_LOADS_MISSES
= 22,
285 COUNT_HW_CACHE_LL_STORES
= 23,
286 COUNT_HW_CACHE_LL_STORES_MISSES
= 24,
287 /* DTLB - Data translation lookaside buffer */
288 COUNT_HW_CACHE_DTLB_LOADS
= 25,
289 COUNT_HW_CACHE_DTLB_LOADS_MISSES
= 26,
290 COUNT_HW_CACHE_DTLB_STORES
= 27,
291 COUNT_HW_CACHE_DTLB_STORES_MISSES
= 28,
292 /* ITLB - Instruction translation lookaside buffer */
293 COUNT_HW_CACHE_ITLB_LOADS
= 29,
294 COUNT_HW_CACHE_ITLB_LOADS_MISSES
= 30,
295 /* BPU - Branch prediction unit */
296 COUNT_HW_CACHE_BPU_LOADS
= 31,
297 COUNT_HW_CACHE_BPU_LOADS_MISSES
= 32,
299 INTERNAL_SW_WALL_TIME
= 33,
300 INTERNAL_INVALID_TP
= 34
305 /* char *description; */
306 enum tracepoint tracepoint
;
309 #define TRACE_MAP_SET(x) \
315 struct trace_map whole_map
[] = {
316 TRACE_MAP_SET(COUNT_SW_CPU_CLOCK
),
317 TRACE_MAP_SET(COUNT_SW_TASK_CLOCK
),
318 TRACE_MAP_SET(COUNT_SW_CONTEXT_SWITCHES
),
319 TRACE_MAP_SET(COUNT_SW_CPU_MIGRATIONS
),
320 TRACE_MAP_SET(COUNT_SW_PAGE_FAULTS
),
321 TRACE_MAP_SET(COUNT_SW_PAGE_FAULTS_MIN
),
322 TRACE_MAP_SET(COUNT_SW_PAGE_FAULTS_MAJ
),
323 TRACE_MAP_SET(COUNT_HW_CPU_CYCLES
),
324 TRACE_MAP_SET(COUNT_HW_INSTRUCTIONS
),
325 TRACE_MAP_SET(COUNT_HW_CACHE_REFERENCES
),
326 TRACE_MAP_SET(COUNT_HW_CACHE_MISSES
),
327 TRACE_MAP_SET(COUNT_HW_BRANCH_INSTRUCTIONS
),
328 TRACE_MAP_SET(COUNT_HW_BRANCH_MISSES
),
329 TRACE_MAP_SET(COUNT_HW_BUS_CYCLES
),
330 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_LOADS
),
331 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_LOADS_MISSES
),
332 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_STORES
),
333 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_STORES_MISSES
),
334 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_PREFETCHES
),
335 TRACE_MAP_SET(COUNT_HW_CACHE_L1I_LOADS
),
336 TRACE_MAP_SET(COUNT_HW_CACHE_L1I_LOADS_MISSES
),
337 TRACE_MAP_SET(COUNT_HW_CACHE_LL_LOADS
),
338 TRACE_MAP_SET(COUNT_HW_CACHE_LL_LOADS_MISSES
),
339 TRACE_MAP_SET(COUNT_HW_CACHE_LL_STORES
),
340 TRACE_MAP_SET(COUNT_HW_CACHE_LL_STORES_MISSES
),
341 TRACE_MAP_SET(COUNT_HW_CACHE_ITLB_LOADS
),
342 TRACE_MAP_SET(COUNT_HW_CACHE_ITLB_LOADS_MISSES
),
343 TRACE_MAP_SET(COUNT_HW_CACHE_BPU_LOADS
),
344 TRACE_MAP_SET(COUNT_HW_CACHE_BPU_LOADS_MISSES
),
345 TRACE_MAP_SET(INTERNAL_SW_WALL_TIME
),
346 TRACE_MAP_SET(INTERNAL_INVALID_TP
)
349 static struct perf_event_attr default_attrs
[] = {
350 { /* Software attributes */
351 .type
= PERF_TYPE_SOFTWARE
,
352 .config
= PERF_COUNT_SW_CPU_CLOCK
354 .type
= PERF_TYPE_SOFTWARE
,
355 .config
= PERF_COUNT_SW_TASK_CLOCK
357 .type
= PERF_TYPE_SOFTWARE
,
358 .config
= PERF_COUNT_SW_CONTEXT_SWITCHES
360 .type
= PERF_TYPE_SOFTWARE
,
361 .config
= PERF_COUNT_SW_CPU_MIGRATIONS
363 .type
= PERF_TYPE_SOFTWARE
,
364 .config
= PERF_COUNT_SW_PAGE_FAULTS
366 .type
= PERF_TYPE_SOFTWARE
,
367 .config
= PERF_COUNT_SW_PAGE_FAULTS_MIN
369 .type
= PERF_TYPE_SOFTWARE
,
370 .config
= PERF_COUNT_SW_PAGE_FAULTS_MAJ
371 }, { /* Hardware attributes */
372 .type
= PERF_TYPE_HARDWARE
,
373 .config
= PERF_COUNT_HW_CPU_CYCLES
375 .type
= PERF_TYPE_HARDWARE
,
376 .config
= PERF_COUNT_HW_INSTRUCTIONS
378 .type
= PERF_TYPE_HARDWARE
,
379 .config
= PERF_COUNT_HW_CACHE_REFERENCES
381 .type
= PERF_TYPE_HARDWARE
,
382 .config
= PERF_COUNT_HW_CACHE_MISSES
384 .type
= PERF_TYPE_HARDWARE
,
385 .config
= PERF_COUNT_HW_BRANCH_INSTRUCTIONS
387 .type
= PERF_TYPE_HARDWARE
,
388 .config
= PERF_COUNT_HW_BRANCH_MISSES
390 .type
= PERF_TYPE_HARDWARE
,
391 .config
= PERF_COUNT_HW_BUS_CYCLES
392 }, { /* Caching attributes */
393 .type
= PERF_TYPE_HW_CACHE
,
394 .config
= ((PERF_COUNT_HW_CACHE_L1D
<< 0) |
395 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
396 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
398 .type
= PERF_TYPE_HW_CACHE
,
399 .config
= ((PERF_COUNT_HW_CACHE_L1D
<< 0) |
400 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
401 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
403 .type
= PERF_TYPE_HW_CACHE
,
404 .config
= ((PERF_COUNT_HW_CACHE_L1D
<< 0) |
405 (PERF_COUNT_HW_CACHE_OP_WRITE
<< 8) |
406 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
408 .type
= PERF_TYPE_HW_CACHE
,
409 .config
= ((PERF_COUNT_HW_CACHE_L1D
<< 0) |
410 (PERF_COUNT_HW_CACHE_OP_WRITE
<< 8) |
411 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
413 .type
= PERF_TYPE_HW_CACHE
,
414 .config
= ((PERF_COUNT_HW_CACHE_L1D
<< 0) |
415 (PERF_COUNT_HW_CACHE_OP_PREFETCH
<< 8) |
416 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
418 .type
= PERF_TYPE_HW_CACHE
,
419 .config
= ((PERF_COUNT_HW_CACHE_L1I
<< 0) |
420 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
421 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
423 .type
= PERF_TYPE_HW_CACHE
,
424 .config
= ((PERF_COUNT_HW_CACHE_L1I
<< 0) |
425 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
426 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
428 .type
= PERF_TYPE_HW_CACHE
,
429 .config
= ((PERF_COUNT_HW_CACHE_LL
<< 0) |
430 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
431 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
433 .type
= PERF_TYPE_HW_CACHE
,
434 .config
= ((PERF_COUNT_HW_CACHE_LL
<< 0) |
435 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
436 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
438 .type
= PERF_TYPE_HW_CACHE
,
439 .config
= ((PERF_COUNT_HW_CACHE_LL
<< 0) |
440 (PERF_COUNT_HW_CACHE_OP_WRITE
<< 8) |
441 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
443 .type
= PERF_TYPE_HW_CACHE
,
444 .config
= ((PERF_COUNT_HW_CACHE_LL
<< 0) |
445 (PERF_COUNT_HW_CACHE_OP_WRITE
<< 8) |
446 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
448 .type
= PERF_TYPE_HW_CACHE
,
449 .config
= ((PERF_COUNT_HW_CACHE_DTLB
<< 0) |
450 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
451 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
453 .type
= PERF_TYPE_HW_CACHE
,
454 .config
= ((PERF_COUNT_HW_CACHE_DTLB
<< 0) |
455 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
456 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
458 .type
= PERF_TYPE_HW_CACHE
,
459 .config
= ((PERF_COUNT_HW_CACHE_DTLB
<< 0) |
460 (PERF_COUNT_HW_CACHE_OP_WRITE
<< 8) |
461 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
463 .type
= PERF_TYPE_HW_CACHE
,
464 .config
= ((PERF_COUNT_HW_CACHE_DTLB
<< 0) |
465 (PERF_COUNT_HW_CACHE_OP_WRITE
<< 8) |
466 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
468 .type
= PERF_TYPE_HW_CACHE
,
469 .config
= ((PERF_COUNT_HW_CACHE_ITLB
<< 0) |
470 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
471 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
473 .type
= PERF_TYPE_HW_CACHE
,
474 .config
= ((PERF_COUNT_HW_CACHE_ITLB
<< 0) |
475 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
476 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
478 .type
= PERF_TYPE_HW_CACHE
,
479 .config
= ((PERF_COUNT_HW_CACHE_BPU
<< 0) |
480 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
481 (PERF_COUNT_HW_CACHE_RESULT_ACCESS
<< 16))
483 .type
= PERF_TYPE_HW_CACHE
,
484 .config
= ((PERF_COUNT_HW_CACHE_BPU
<< 0) |
485 (PERF_COUNT_HW_CACHE_OP_READ
<< 8) |
486 (PERF_COUNT_HW_CACHE_RESULT_MISS
<< 16))
/*
 * Interrupt flag — presumably raised by the intr() SIGINT handler
 * (registered in main); must be volatile so the main thread's reads
 * are not cached across the asynchronous handler write (C11 7.14.1).
 */
static volatile sig_atomic_t sigint = 0;

/* getopt_long() option string; keep in sync with long_options[]. */
static const char *short_options = "p:c:ekuyvhlx:i";
496 static struct option long_options
[] = {
497 {"pid", required_argument
, 0, 'p'},
498 {"cpu", required_argument
, 0, 'c'},
499 {"use", required_argument
, 0, 'x'},
500 {"excl", no_argument
, 0, 'e'},
501 {"kernel", no_argument
, 0, 'k'},
502 {"user", no_argument
, 0, 'u'},
503 {"hyper", no_argument
, 0, 'y'},
504 {"idle", no_argument
, 0, 'i'},
505 {"list", no_argument
, 0, 'l'},
506 {"version", no_argument
, 0, 'v'},
507 {"help", no_argument
, 0, 'h'},
515 int fds
[MAX_COUNTERS
];
516 struct perf_event_attr
*attrs
;
517 unsigned long long wall_start
;
520 static inline void die(void)
525 static void usage(void)
527 printf("\n%s %s\n", PROGNAME
, VERSNAME
);
528 printf("Usage: %s [options] [<cmd>]\n", PROGNAME
);
529 printf("Options:\n");
530 printf(" -p|--pid <pid> Attach to running process/kthread\n");
531 printf(" -c|--cpu <cpu> Bind counter to cpuid\n");
532 printf(" -e|--excl Be exclusive counter group on CPU\n");
533 printf(" -k|--kernel Count events in kernel mode\n");
534 printf(" -u|--user Count events in user mode\n");
535 printf(" -y|--hyper Count events in hypervisor mode\n");
536 printf(" -i|--idle Do also count when idle\n");
537 printf(" -l|--list List possible events\n");
538 printf(" -x|--use <event> Count only a certain event\n");
539 printf(" -v|--version Print version\n");
540 printf(" -h|--help Print this help\n");
542 printf("Please report bugs to <dborkma@tik.ee.ethz.ch>\n");
543 printf("Copyright (C) 2011 Daniel Borkmann\n");
544 printf("License: GNU GPL version 2\n");
545 printf("This is free software: you are free to change and redistribute it.\n");
546 printf("There is NO WARRANTY, to the extent permitted by law.\n\n");
551 static void version(void)
553 printf("\n%s %s\n", PROGNAME
, VERSNAME
);
554 printf("Please report bugs to <dborkma@tik.ee.ethz.ch>\n");
555 printf("Copyright (C) 2011 Daniel Borkmann\n");
556 printf("License: GNU GPL version 2\n");
557 printf("This is free software: you are free to change and redistribute it.\n");
558 printf("There is NO WARRANTY, to the extent permitted by law.\n\n");
563 static void reaper(int sig
)
567 while ((pid
= waitpid(-1, &status
, WNOHANG
)) > 0)
571 static void intr(int sig
)
576 static inline void register_signal(int signal
, void (*handler
)(int))
579 struct sigaction saction
;
581 sigfillset(&block_mask
);
582 saction
.sa_handler
= handler
;
583 saction
.sa_mask
= block_mask
;
584 saction
.sa_flags
= SA_RESTART
;
586 sigaction(signal
, &saction
, NULL
);
589 static inline unsigned long long rdclock(void)
593 clock_gettime(CLOCK_MONOTONIC
, &ts
);
594 return ts
.tv_sec
* 1000000000ULL + ts
.tv_nsec
;
597 static inline void panic(char *msg
, ...)
601 vfprintf(stderr
, msg
, vl
);
607 static inline void whine(char *msg
, ...)
611 vfprintf(stderr
, msg
, vl
);
615 static void *xzmalloc(size_t size
)
619 if (unlikely(size
== 0))
620 panic("xzmalloc: zero size\n");
623 if (unlikely(ptr
== NULL
))
624 panic("xzmalloc: out of memory (allocating %lu bytes)\n",
626 memset(ptr
, 0, size
);
631 static void xfree(void *ptr
)
633 if (unlikely(ptr
== NULL
))
634 panic("xfree: NULL pointer given as argument\n");
639 * The 'group_fd' parameter allows counter "groups" to be set up. A
640 * counter group has one counter which is the group "leader". The leader
641 * is created first, with group_fd = -1 in the perf_event_open call
642 * that creates it. The rest of the group members are created
643 * subsequently, with group_fd giving the fd of the group leader.
644 * (A single counter on its own is created with group_fd = -1 and is
645 * considered to be a group with only 1 member.)
647 * A counter group is scheduled onto the CPU as a unit, that is, it will
648 * only be put onto the CPU if all of the counters in the group can be
649 * put onto the CPU. This means that the values of the member counters
650 * can be meaningfully compared, added, divided (to get ratios), etc.,
651 * with each other, since they have counted events for the same set of
652 * executed instructions.
654 static inline int sys_perf_event_open(struct perf_event_attr
*attr
, pid_t pid
,
655 int cpu
, int group_fd
,
660 * pid == 0: counter attached to current task
661 * pid > 0: counter attached to specific task
662 * pid < 0: counter attached to all tasks
664 * cpu >= 0: counter restricted to a specific CPU
665 * cpu == -1: counter counts on all CPUs
666 * User/kernel/hypervisor modes:
667 * See attr bits for excluding stuff!
668 * Note: pid == -1 && cpu == -1 is invalid!
671 attr
->size
= sizeof(*attr
);
672 return syscall(__NR_perf_event_open
, attr
, pid
, cpu
, group_fd
, flags
);
/*
 * Thread id of the calling thread, via the raw syscall — the libc of
 * the targeted era (kernel >= 2.6.32) provides no gettid() wrapper.
 * Fixed: empty parens `()` declare no prototype in C; use (void).
 */
static inline pid_t gettid(void)
{
	return syscall(SYS_gettid);
}
680 static struct perf_data
*initialize(pid_t pid
, int cpu
, int mode
, int excl
)
683 struct perf_data
*pd
;
684 struct perf_event_attr
*attr
;
685 struct perf_event_attr
*attrs
;
687 pd
= xzmalloc(sizeof(*pd
));
690 pd
->group
= GRP_INVALID
;
691 for (i
= 0; i
< ARRAY_SIZE(pd
->fds
); i
++)
692 pd
->fds
[i
] = FDS_INVALID
;
696 attrs
= xzmalloc(sizeof(*attrs
) * ARRAY_SIZE(default_attrs
));
697 memcpy(attrs
, default_attrs
, sizeof(default_attrs
));
700 for (i
= 0; i
< ARRAY_SIZE(default_attrs
); i
++) {
705 attr
->enable_on_exec
= 0;
706 attr
->exclusive
= excl
;
707 attr
->exclude_user
= ((mode
& MODE_USER
) == 0);
708 attr
->exclude_kernel
= ((mode
& MODE_KERNEL
) == 0);
709 attr
->exclude_hv
= ((mode
& MODE_HYPER
) == 0);
710 attr
->exclude_idle
= ((mode
& MODE_IDLE
) == 0);
711 /* pd->fds[0] is counter group leader! */
712 pd
->fds
[i
] = sys_perf_event_open(attr
, pid
, cpu
,
713 i
== 0 ? GRP_INVALID
: pd
->fds
[0],
714 PERF_IOC_FLAG_GROUP
);
715 if (unlikely(pd
->fds
[i
] < 0))
716 panic("sys_perf_event_open failed: %s\n", strerror(errno
));
719 pd
->group
= pd
->fds
[0];
724 * A read() on a counter returns the current value of the counter and possible
725 * additional values as specified by 'read_format', each value is a u64 (8 bytes)
728 static uint64_t read_counter(struct perf_data
*pd
, int counter
)
733 if (counter
== INTERNAL_SW_WALL_TIME
)
734 return (uint64_t) (rdclock() - pd
->wall_start
);
735 if (unlikely(counter
< 0 || counter
> MAX_COUNTERS
))
736 panic("bug! invalid counter value!\n");
738 ret
= read(pd
->fds
[counter
], &value
, sizeof(uint64_t));
739 if (unlikely(ret
!= sizeof(uint64_t)))
740 panic("perf_counter read error!\n");
746 * Counters can be enabled and disabled in two ways: via ioctl and via
747 * prctl. When a counter is disabled, it doesn't count or generate
748 * events but does continue to exist and maintain its count value.
750 * Enabling or disabling the leader of a group enables or disables the
751 * whole group; that is, while the group leader is disabled, none of the
752 * counters in the group will count. Enabling or disabling a member of a
753 * group other than the leader only affects that counter - disabling an
754 * non-leader stops that counter from counting but doesn't affect any
757 static void enable_counter(struct perf_data
*pd
, int counter
)
761 if (unlikely(counter
< 0 || counter
>= MAX_COUNTERS
))
762 panic("bug! invalid counter value!\n");
763 if (pd
->fds
[counter
] == FDS_INVALID
) {
764 pd
->fds
[counter
] = sys_perf_event_open(&pd
->attrs
[counter
],
767 PERF_IOC_FLAG_GROUP
);
768 if (unlikely(pd
->fds
[counter
] < 0))
769 panic("sys_perf_event_open failed!\n");
772 ret
= ioctl(pd
->fds
[counter
], PERF_EVENT_IOC_ENABLE
);
774 panic("error enabling perf counter!\n");
776 pd
->wall_start
= rdclock();
779 static void enable_all_counter(struct perf_data
*pd
)
783 for (i
= 0; i
< MAX_COUNTERS
; i
++) {
784 enable_counter(pd
, i
);
787 /* XXX: Only group leader? */
789 for (i
= 0; i
< MAX_COUNTERS
; i
++) {
790 /* ret = ioctl(pd->group, PERF_EVENT_IOC_ENABLE); */
791 ret
= ioctl(pd
->fds
[i
], PERF_EVENT_IOC_ENABLE
);
793 panic("error enabling perf counter!\n");
796 pd
->wall_start
= rdclock();
799 static void disable_counter(struct perf_data
*pd
, int counter
)
803 if (unlikely(counter
< 0 || counter
>= MAX_COUNTERS
))
804 panic("bug! invalid counter value!\n");
805 if (pd
->fds
[counter
] == FDS_INVALID
)
808 ret
= ioctl(pd
->fds
[counter
], PERF_EVENT_IOC_DISABLE
);
810 panic("error disabling perf counter!\n");
813 static void disable_all_counter(struct perf_data
*pd
)
817 /* XXX: Only group leader? */
818 for (i
= 0; i
< MAX_COUNTERS
; i
++) {
819 disable_counter(pd
, i
);
823 static void cleanup(struct perf_data
*pd
)
827 for (i
= 0; i
< ARRAY_SIZE(default_attrs
); i
++)
834 static void list_counter(void)
838 for (i
= 0; i
< ARRAY_SIZE(whole_map
); i
++)
839 printf("%s\n", whole_map
[i
].name
);
844 static enum tracepoint
lookup_counter(char *name
)
848 for (i
= 0; i
< ARRAY_SIZE(whole_map
); i
++)
849 if (!strncmp(whole_map
[i
].name
, name
, sizeof(whole_map
[i
].name
) - 1))
850 return whole_map
[i
].tracepoint
;
851 return INTERNAL_INVALID_TP
;
854 static void print_whole_result(struct perf_data
*pd
)
858 printf("Software counters:\n");
859 printf(" CPU clock ticks %" PRIu64
"\n", read_counter(pd
, COUNT_SW_CPU_CLOCK
));
860 printf(" task clock ticks %" PRIu64
"\n", read_counter(pd
, COUNT_SW_TASK_CLOCK
));
861 printf(" CPU context switches %" PRIu64
"\n", read_counter(pd
, COUNT_SW_CONTEXT_SWITCHES
));
862 printf(" CPU migrations %" PRIu64
"\n", read_counter(pd
, COUNT_SW_CPU_MIGRATIONS
));
863 printf(" pagefaults/minor/major %" PRIu64
"/%" PRIu64
"/%" PRIu64
"\n",
864 read_counter(pd
, COUNT_SW_PAGE_FAULTS
),
865 read_counter(pd
, COUNT_SW_PAGE_FAULTS_MIN
),
866 read_counter(pd
, COUNT_SW_PAGE_FAULTS_MAJ
));
867 printf("Hardware counters:\n");
868 printf(" CPU cycles %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CPU_CYCLES
));
869 printf(" instructions %" PRIu64
"\n", read_counter(pd
, COUNT_HW_INSTRUCTIONS
));
870 tmp1
= read_counter(pd
, COUNT_HW_CACHE_REFERENCES
);
871 tmp2
= read_counter(pd
, COUNT_HW_CACHE_MISSES
);
872 printf(" cache references %" PRIu64
"\n", tmp1
);
873 printf(" cache misses (rate) %" PRIu64
" (%.4lf %%)\n", tmp2
, (1.0 * tmp2
/ tmp1
) * 100.0);
874 tmp1
= read_counter(pd
, COUNT_HW_BRANCH_INSTRUCTIONS
);
875 tmp2
= read_counter(pd
, COUNT_HW_BRANCH_MISSES
);
876 printf(" branch instructions %" PRIu64
"\n", tmp1
);
877 printf(" branch misses (rate) %" PRIu64
" (%.4lf %%)\n", tmp2
, (1.0 * tmp2
/ tmp1
) * 100.0);
878 printf(" bus cycles %" PRIu64
"\n", read_counter(pd
, COUNT_HW_BUS_CYCLES
));
879 printf("L1D, data cache:\n");
880 printf(" loads %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_L1D_LOADS
));
881 printf(" load misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_L1D_LOADS_MISSES
));
882 printf(" stores %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_L1D_STORES
));
883 printf(" store misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_L1D_STORES_MISSES
));
884 printf(" prefetches %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_L1D_PREFETCHES
));
885 printf("L1I, instruction cache:\n");
886 printf(" loads %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_L1I_LOADS
));
887 printf(" load misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_L1I_LOADS_MISSES
));
888 printf("LL, last level cache:\n");
889 printf(" loads %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_LL_LOADS
));
890 printf(" load misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_LL_LOADS_MISSES
));
891 printf(" stores %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_LL_STORES
));
892 printf(" store misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_LL_STORES_MISSES
));
893 printf("DTLB, data translation lookaside buffer:\n");
894 printf(" loads %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_DTLB_LOADS
));
895 printf(" load misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_DTLB_LOADS_MISSES
));
896 printf(" stores %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_DTLB_STORES
));
897 printf(" store misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_DTLB_STORES_MISSES
));
898 printf("ILLB, instruction translation lookaside buffer:\n");
899 printf(" loads %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_ITLB_LOADS
));
900 printf(" load misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_ITLB_LOADS_MISSES
));
901 printf("BPU, branch prediction unit:\n");
902 printf(" loads %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_BPU_LOADS
));
903 printf(" load misses %" PRIu64
"\n", read_counter(pd
, COUNT_HW_CACHE_BPU_LOADS_MISSES
));
904 printf("Wall-clock time elapsed:\n");
905 printf(" usec %" PRIu64
"\n", read_counter(pd
, INTERNAL_SW_WALL_TIME
));
908 int main(int argc
, char **argv
)
910 int status
, c
, opt_index
, mode
, pt
, cpu
, excl
, ret
;
913 struct perf_data
*pd
;
919 cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
921 mode
= excl
= pt
= 0;
922 tp
= INTERNAL_INVALID_TP
;
924 while ((c
= getopt_long(argc
, argv
, short_options
, long_options
,
925 &opt_index
)) != EOF
) {
936 panic("bad pid! either 0 for all procs "
942 whine("not yet working correctly!\n");
946 if (cpu
< 0 || cpu
>= cpus
)
947 panic("bad cpuid! needs to be 0 <= x < "
969 tp
= lookup_counter(optarg
);
970 printf("found: %d\n", tp
);
978 if (pt
&& pid
== -1 && cpu
== -1)
979 panic("either all procs on a single core or all cpus on a "
980 "single proc, but not both!\n");
983 mode
= MODE_KERNEL
| MODE_USER
| MODE_HYPER
;
988 register_signal(SIGCHLD
, reaper
);
989 ret
= ptrace(PT_ATTACH
, pid
, (char *) 1, 0);
991 panic("cannot attach to process!\n");
994 fprintf(stderr
, "Process %u attached - interrupt to quit\n",
998 pd
= initialize(pid
, cpu
, mode
, excl
);
999 if (tp
== INTERNAL_INVALID_TP
)
1000 enable_all_counter(pd
);
1002 enable_counter(pd
, tp
);
1005 execvp(argv
[optind
], &argv
[optind
]);
1008 register_signal(SIGINT
, intr
);
1010 if (tp
== INTERNAL_INVALID_TP
)
1011 disable_all_counter(pd
);
1013 disable_counter(pd
, tp
);
1016 ret
= ptrace(PT_DETACH
, pid
, (char *) 1, SIGCONT
);
1018 panic("cannot detach from process!\n");
1021 fprintf(stderr
, "Process %u detached\n", pid
);
1025 printf("CPU: all, PID: %d\n", pid
);
1027 printf("CPU: %d, PID: %d\n", cpu
, pid
);
1028 printf("Kernel: %s, User: %s, Hypervisor: %s\n",
1029 (mode
& MODE_KERNEL
) == MODE_KERNEL
? "on" : "off",
1030 (mode
& MODE_USER
) == MODE_USER
? "on" : "off",
1031 (mode
& MODE_HYPER
) == MODE_HYPER
? "on" : "off");
1033 if (tp
== INTERNAL_INVALID_TP
)
1034 print_whole_result(pd
);
1036 printf("%" PRIu64
" in %" PRIu64
" usec\n",
1037 read_counter(pd
, tp
),
1038 read_counter(pd
, INTERNAL_SW_WALL_TIME
));