added proc file
[ana-net.git] / opt / pstat.c
blobd80ca70e9b419e42f03d299551f68b8f8f1a9fa1
1 /*
2 * pstat - Linux performance counter subsystem uspace or kthread tracing
4 * Borrowed some code from libperf, which has been written by
5 * Copyright 2010 Wolfgang Richter <wolf@cs.cmu.edu>
6 * Copyright 2010 Ekaterina Taralova <etaralova@cs.cmu.edu>
7 * Copyright 2010 Karl Naden <kbn@cs.cmu.edu>
8 * Subject to the GPL.
10 * Performance events, data type definitions, declarations by
11 * Copyright 2008-2009 Thomas Gleixner <tglx@linutronix.de>
12 * Copyright 2008-2009 Ingo Molnar <mingo@redhat.com>
13 * Copyright 2008-2009 Peter Zijlstra <pzijlstr@redhat.com>
14 * Copyright 2009 Paul Mackerras <paulus@au1.ibm.com>
15 * Subject to the GPL / see COPYING.
17 * pstat has been written by
18 * Copyright 2011 Daniel Borkmann <dborkma@tik.ee.ethz.ch>
19 * Swiss federal institute of technology (ETH Zurich)
20 * Subject to the GPL.
22 * Needs Linux kernel >= 2.6.32. For more detailed information have a look at
23 * tools/perf/design.txt and http://lkml.org/lkml/2009/6/6/149. Tested on
24 * x86_64. Larger comments refer to tools/perf/design.txt. Be warned, the stuff
25 * from design.txt, especially data structures are heavily deprecated!
27 * Compile: gcc pstat.c -o pstat -lrt -O2
28 * Patches are welcome! Mail them to <dborkma@tik.ee.ethz.ch>.
29 * - Additions made by Emmanuel Roullit <emmanuel@netsniff-ng.org>
31 * Not yet working:
32 * - Tracing another already running pid not yet working! CPU goes up
33 * to 100% and the program never returns.
34 * - Tracing a single event returns in strange numbers! May be because
35 * of group leader settings?
#include <stdio.h>
#include <stdint.h>
#include <assert.h>
#include <fcntl.h>
#include <inttypes.h>
#include <math.h>
#include <getopt.h>
#include <stdlib.h>
#include <string.h>
#include <stropts.h>
#include <time.h>
#include <stdarg.h>
#include <unistd.h>
#include <signal.h>
#include <errno.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <asm/byteorder.h>
#include <linux/types.h>
#include <linux/ioctl.h>
/*
 * Attribute type: selects which class of event attr.config refers to.
 */
enum perf_type_id {
	PERF_TYPE_HARDWARE	= 0,
	PERF_TYPE_SOFTWARE	= 1,
	PERF_TYPE_TRACEPOINT	= 2,
	PERF_TYPE_HW_CACHE	= 3,
	PERF_TYPE_RAW		= 4,
	PERF_TYPE_BREAKPOINT	= 5,
	PERF_TYPE_MAX,		/* non-ABI */
};
/*
 * Generalized performance event event_id types, used by the
 * attr.event_id parameter of the sys_perf_event_open() syscall
 * (with attr.type == PERF_TYPE_HARDWARE):
 */
enum perf_hw_id {
	PERF_COUNT_HW_CPU_CYCLES		= 0,
	PERF_COUNT_HW_INSTRUCTIONS		= 1,
	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
	PERF_COUNT_HW_CACHE_MISSES		= 3,
	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
	PERF_COUNT_HW_BRANCH_MISSES		= 5,
	PERF_COUNT_HW_BUS_CYCLES		= 6,
	PERF_COUNT_HW_MAX,			/* non-ABI */
};
/*
 * Generalized hardware cache events:
 *
 *	{ L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
 *	{ read, write, prefetch } x
 *	{ accesses, misses }
 *
 * The three axes are OR-ed into attr.config at bit offsets 0, 8 and 16.
 */
enum perf_hw_cache_id {
	PERF_COUNT_HW_CACHE_L1D		= 0,
	PERF_COUNT_HW_CACHE_L1I		= 1,
	PERF_COUNT_HW_CACHE_LL		= 2,
	PERF_COUNT_HW_CACHE_DTLB	= 3,
	PERF_COUNT_HW_CACHE_ITLB	= 4,
	PERF_COUNT_HW_CACHE_BPU		= 5,
	PERF_COUNT_HW_CACHE_MAX,	/* non-ABI */
};
/* Cache operation axis, OR-ed into attr.config at bit offset 8 */
enum perf_hw_cache_op_id {
	PERF_COUNT_HW_CACHE_OP_READ	= 0,
	PERF_COUNT_HW_CACHE_OP_WRITE	= 1,
	PERF_COUNT_HW_CACHE_OP_PREFETCH	= 2,
	PERF_COUNT_HW_CACHE_OP_MAX,	/* non-ABI */
};
/* Cache result axis, OR-ed into attr.config at bit offset 16 */
enum perf_hw_cache_op_result_id {
	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
};
/*
 * Special "software" events provided by the kernel, even if the hardware
 * does not support performance events. These events measure various
 * physical and sw events of the kernel (and allow the profiling of them as
 * well):
 */
enum perf_sw_ids {
	PERF_COUNT_SW_CPU_CLOCK		= 0,
	PERF_COUNT_SW_TASK_CLOCK	= 1,
	PERF_COUNT_SW_PAGE_FAULTS	= 2,
	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
	PERF_COUNT_SW_ALIGNMENT_FAULTS	= 7,
	PERF_COUNT_SW_EMULATION_FAULTS	= 8,
	PERF_COUNT_SW_MAX,		/* non-ABI */
};
/*
 * Hardware event_id to monitor via a performance monitoring event:
 *
 * The 'disabled' bit specifies whether the counter starts out disabled
 * or enabled. If it is initially disabled, it can be enabled by ioctl
 * or prctl.
 *
 * The 'inherit' bit, if set, specifies that this counter should count
 * events on descendant tasks as well as the task specified. This only
 * applies to new descendants, not to any existing descendants at the
 * time the counter is created (nor to any new descendants of existing
 * descendants).
 *
 * The 'pinned' bit, if set, specifies that the counter should always be
 * on the CPU if at all possible. It only applies to hardware counters
 * and only to group leaders. If a pinned counter cannot be put onto the
 * CPU (e.g. because there are not enough hardware counters or because of
 * a conflict with some other event), then the counter goes into an
 * 'error' state, where reads return end-of-file (i.e. read() returns 0)
 * until the counter is subsequently enabled or disabled.
 *
 * The 'exclusive' bit, if set, specifies that when this counter's group
 * is on the CPU, it should be the only group using the CPU's counters.
 * In future, this will allow sophisticated monitoring programs to supply
 * extra configuration information via 'extra_config_len' to exploit
 * advanced features of the CPU's Performance Monitor Unit (PMU) that are
 * not otherwise accessible and that might disrupt other hardware
 * counters.
 *
 * The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
 * way to request that counting of events be restricted to times when the
 * CPU is in user, kernel and/or hypervisor mode.
 *
 * NOTE(review): this mirrors the kernel ABI struct from linux/perf_event.h
 * circa 2.6.36; field layout must stay byte-compatible with the running
 * kernel's expectations (attr.size carries the compat contract).
 */
struct perf_event_attr {
	/*
	 * Major type: hardware/software/tracepoint/etc.
	 */
	__u32 type;
	/*
	 * Size of the attr structure, for fwd/bwd compat.
	 */
	__u32 size;
	/*
	 * Type specific configuration information.
	 */
	__u64 config;
	union {
		__u64 sample_period;
		__u64 sample_freq;
	};
	__u64 sample_type;
	__u64 read_format;
	__u64 disabled:1,	/* off by default */
	      inherit:1,	/* children inherit it */
	      pinned:1,		/* must always be on PMU */
	      exclusive:1,	/* only group on PMU */
	      exclude_user:1,	/* don't count user */
	      exclude_kernel:1,	/* ditto kernel */
	      exclude_hv:1,	/* ditto hypervisor */
	      exclude_idle:1,	/* don't count when idle */
	      mmap:1,		/* include mmap data */
	      comm:1,		/* include comm data */
	      freq:1,		/* use freq, not period */
	      inherit_stat:1,	/* per task counts */
	      enable_on_exec:1,	/* next exec enables */
	      task:1,		/* trace fork/exit */
	      watermark:1,	/* wakeup_watermark */
	      precise_ip:2,	/* skid constraint */
	      mmap_data:1,	/* non-exec mmap data */
	      __reserved_1:46;
	union {
		__u32 wakeup_events;	/* wakeup every n events */
		__u32 wakeup_watermark;	/* bytes before wakeup */
	};
	__u32 bp_type;
	__u64 bp_addr;
	__u64 bp_len;
};
/* Flag for the perf ioctls: apply the operation to the whole group */
enum perf_event_ioc_flags {
	PERF_IOC_FLAG_GROUP = 1U << 0,
};
/*
 * Ioctls that can be done on a perf event fd:
 */
#define PERF_EVENT_IOC_ENABLE	_IO ('$', 0)
#define PERF_EVENT_IOC_DISABLE	_IO ('$', 1)
#define PERF_EVENT_IOC_REFRESH	_IO ('$', 2)

#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
/*
 * Must cover one fd slot per entry of default_attrs. There are 33
 * counters (enum tracepoint 0 .. COUNT_HW_CACHE_BPU_LOADS_MISSES == 32),
 * so the previous value of 32 made initialize() overflow pd->fds by one
 * element. Keep this in sync with the default_attrs table!
 */
#define MAX_COUNTERS 33

#define FDS_INVALID  -1		/* fd slot not opened (yet) */
#define GRP_INVALID  -1		/* "no group leader" for perf_event_open */

/* Bit flags for the count mode, OR-able; see initialize() */
#define MODE_KERNEL  1
#define MODE_USER    2
#define MODE_HYPER   4
#define MODE_IDLE    8

#ifndef likely
# define likely(x)   __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
# define unlikely(x) __builtin_expect(!!(x), 0)
#endif
#ifndef bug
# define bug()       __builtin_trap()
#endif

#define PROGNAME "pstat"
#define VERSNAME "0.9"
/*
 * Constants: pstat's own counter namespace. The value of each entry is
 * also the index into default_attrs[] and perf_data.fds[].
 */
enum tracepoint {
	/* Software tracepoints */
	COUNT_SW_CPU_CLOCK = 0,
	COUNT_SW_TASK_CLOCK = 1,
	COUNT_SW_CONTEXT_SWITCHES = 2,
	COUNT_SW_CPU_MIGRATIONS = 3,
	COUNT_SW_PAGE_FAULTS = 4,
	COUNT_SW_PAGE_FAULTS_MIN = 5,
	COUNT_SW_PAGE_FAULTS_MAJ = 6,
	/* Hardware counters */
	COUNT_HW_CPU_CYCLES = 7,
	COUNT_HW_INSTRUCTIONS = 8,
	COUNT_HW_CACHE_REFERENCES = 9,
	COUNT_HW_CACHE_MISSES = 10,
	COUNT_HW_BRANCH_INSTRUCTIONS = 11,
	COUNT_HW_BRANCH_MISSES = 12,
	COUNT_HW_BUS_CYCLES = 13,
	/* Cache counters */
	/* L1D - data cache */
	COUNT_HW_CACHE_L1D_LOADS = 14,
	COUNT_HW_CACHE_L1D_LOADS_MISSES = 15,
	COUNT_HW_CACHE_L1D_STORES = 16,
	COUNT_HW_CACHE_L1D_STORES_MISSES = 17,
	COUNT_HW_CACHE_L1D_PREFETCHES = 18,
	/* L1I - Instruction cache */
	COUNT_HW_CACHE_L1I_LOADS = 19,
	COUNT_HW_CACHE_L1I_LOADS_MISSES = 20,
	/* LL - Last level cache */
	COUNT_HW_CACHE_LL_LOADS = 21,
	COUNT_HW_CACHE_LL_LOADS_MISSES = 22,
	COUNT_HW_CACHE_LL_STORES = 23,
	COUNT_HW_CACHE_LL_STORES_MISSES = 24,
	/* DTLB - Data translation lookaside buffer */
	COUNT_HW_CACHE_DTLB_LOADS = 25,
	COUNT_HW_CACHE_DTLB_LOADS_MISSES = 26,
	COUNT_HW_CACHE_DTLB_STORES = 27,
	COUNT_HW_CACHE_DTLB_STORES_MISSES = 28,
	/* ITLB - Instruction translation lookaside buffer */
	COUNT_HW_CACHE_ITLB_LOADS = 29,
	COUNT_HW_CACHE_ITLB_LOADS_MISSES = 30,
	/* BPU - Branch prediction unit */
	COUNT_HW_CACHE_BPU_LOADS = 31,
	COUNT_HW_CACHE_BPU_LOADS_MISSES = 32,
	/* Internal pseudo counters, not backed by a perf fd */
	INTERNAL_SW_WALL_TIME = 33,
	INTERNAL_INVALID_TP = 34
};
/* Maps a user-visible counter name (as given to -x) to its enum value */
struct trace_map {
	char *name;
	/* char *description; */
	enum tracepoint tracepoint;
};

/* Builds a trace_map entry whose name is the stringified enum constant */
#define TRACE_MAP_SET(x)		\
	{				\
		.name = #x,		\
		.tracepoint = x		\
	}
315 struct trace_map whole_map[] = {
316 TRACE_MAP_SET(COUNT_SW_CPU_CLOCK),
317 TRACE_MAP_SET(COUNT_SW_TASK_CLOCK),
318 TRACE_MAP_SET(COUNT_SW_CONTEXT_SWITCHES),
319 TRACE_MAP_SET(COUNT_SW_CPU_MIGRATIONS),
320 TRACE_MAP_SET(COUNT_SW_PAGE_FAULTS),
321 TRACE_MAP_SET(COUNT_SW_PAGE_FAULTS_MIN),
322 TRACE_MAP_SET(COUNT_SW_PAGE_FAULTS_MAJ),
323 TRACE_MAP_SET(COUNT_HW_CPU_CYCLES),
324 TRACE_MAP_SET(COUNT_HW_INSTRUCTIONS),
325 TRACE_MAP_SET(COUNT_HW_CACHE_REFERENCES),
326 TRACE_MAP_SET(COUNT_HW_CACHE_MISSES),
327 TRACE_MAP_SET(COUNT_HW_BRANCH_INSTRUCTIONS),
328 TRACE_MAP_SET(COUNT_HW_BRANCH_MISSES),
329 TRACE_MAP_SET(COUNT_HW_BUS_CYCLES),
330 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_LOADS),
331 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_LOADS_MISSES),
332 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_STORES),
333 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_STORES_MISSES),
334 TRACE_MAP_SET(COUNT_HW_CACHE_L1D_PREFETCHES),
335 TRACE_MAP_SET(COUNT_HW_CACHE_L1I_LOADS),
336 TRACE_MAP_SET(COUNT_HW_CACHE_L1I_LOADS_MISSES),
337 TRACE_MAP_SET(COUNT_HW_CACHE_LL_LOADS),
338 TRACE_MAP_SET(COUNT_HW_CACHE_LL_LOADS_MISSES),
339 TRACE_MAP_SET(COUNT_HW_CACHE_LL_STORES),
340 TRACE_MAP_SET(COUNT_HW_CACHE_LL_STORES_MISSES),
341 TRACE_MAP_SET(COUNT_HW_CACHE_ITLB_LOADS),
342 TRACE_MAP_SET(COUNT_HW_CACHE_ITLB_LOADS_MISSES),
343 TRACE_MAP_SET(COUNT_HW_CACHE_BPU_LOADS),
344 TRACE_MAP_SET(COUNT_HW_CACHE_BPU_LOADS_MISSES),
345 TRACE_MAP_SET(INTERNAL_SW_WALL_TIME),
346 TRACE_MAP_SET(INTERNAL_INVALID_TP)
/*
 * One perf_event_attr per pstat counter; the index of each entry equals
 * its enum tracepoint value, so keep table order and enum order in sync.
 * HW-cache configs pack {cache-id | op << 8 | result << 16} into .config.
 */
static struct perf_event_attr default_attrs[] = {
	{ /* Software attributes */
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CPU_CLOCK
	}, {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_TASK_CLOCK
	}, {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CONTEXT_SWITCHES
	}, {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_CPU_MIGRATIONS
	}, {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_PAGE_FAULTS
	}, {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_PAGE_FAULTS_MIN
	}, {
		.type = PERF_TYPE_SOFTWARE,
		.config = PERF_COUNT_SW_PAGE_FAULTS_MAJ
	}, { /* Hardware attributes */
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES
	}, {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_INSTRUCTIONS
	}, {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CACHE_REFERENCES
	}, {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CACHE_MISSES
	}, {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS
	}, {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_BRANCH_MISSES
	}, {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_BUS_CYCLES
	}, { /* Caching attributes: L1D read access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_L1D << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* L1D read miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_L1D << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}, { /* L1D write access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_L1D << 0) |
			   (PERF_COUNT_HW_CACHE_OP_WRITE << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* L1D write miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_L1D << 0) |
			   (PERF_COUNT_HW_CACHE_OP_WRITE << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}, { /* L1D prefetch */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_L1D << 0) |
			   (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* L1I read access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_L1I << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* L1I read miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_L1I << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}, { /* LL read access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_LL << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* LL read miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_LL << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}, { /* LL write access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_LL << 0) |
			   (PERF_COUNT_HW_CACHE_OP_WRITE << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* LL write miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_LL << 0) |
			   (PERF_COUNT_HW_CACHE_OP_WRITE << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}, { /* DTLB read access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_DTLB << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* DTLB read miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_DTLB << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}, { /* DTLB write access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_DTLB << 0) |
			   (PERF_COUNT_HW_CACHE_OP_WRITE << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* DTLB write miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_DTLB << 0) |
			   (PERF_COUNT_HW_CACHE_OP_WRITE << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}, { /* ITLB read access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_ITLB << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* ITLB read miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_ITLB << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}, { /* BPU read access */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_BPU << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16))
	}, { /* BPU read miss */
		.type = PERF_TYPE_HW_CACHE,
		.config = ((PERF_COUNT_HW_CACHE_BPU << 0) |
			   (PERF_COUNT_HW_CACHE_OP_READ << 8) |
			   (PERF_COUNT_HW_CACHE_RESULT_MISS << 16))
	}
};
extern int optind;

/*
 * Set by the SIGINT handler; must be volatile sig_atomic_t so the
 * compiler re-reads it instead of caching it across the handler.
 */
static volatile sig_atomic_t sigint = 0;

/* Option string; ':' marks options that take an argument */
static const char *short_options = "p:c:ekuyvhlx:i";

static struct option long_options[] = {
	{"pid", required_argument, 0, 'p'},
	{"cpu", required_argument, 0, 'c'},
	{"use", required_argument, 0, 'x'},
	{"excl", no_argument, 0, 'e'},
	{"kernel", no_argument, 0, 'k'},
	{"user", no_argument, 0, 'u'},
	{"hyper", no_argument, 0, 'y'},
	{"idle", no_argument, 0, 'i'},
	{"list", no_argument, 0, 'l'},
	{"version", no_argument, 0, 'v'},
	{"help", no_argument, 0, 'h'},
	{0, 0, 0, 0}
};
/* Per-tracing-session state: target, group leader and one fd per counter */
struct perf_data {
	pid_t pid;			/* traced task, or -1 for all tasks */
	int cpu;			/* bound CPU, or -1 for all CPUs */
	int group;			/* group leader fd (== fds[0]) */
	int fds[MAX_COUNTERS];		/* perf fds, FDS_INVALID if unopened */
	struct perf_event_attr *attrs;	/* private copy of default_attrs */
	unsigned long long wall_start;	/* rdclock() ns at enable time */
};
/* Terminates the process with a failure exit code */
static inline void die(void)
{
	exit(EXIT_FAILURE);
}
525 static void usage(void)
527 printf("\n%s %s\n", PROGNAME, VERSNAME);
528 printf("Usage: %s [options] [<cmd>]\n", PROGNAME);
529 printf("Options:\n");
530 printf(" -p|--pid <pid> Attach to running process/kthread\n");
531 printf(" -c|--cpu <cpu> Bind counter to cpuid\n");
532 printf(" -e|--excl Be exclusive counter group on CPU\n");
533 printf(" -k|--kernel Count events in kernel mode\n");
534 printf(" -u|--user Count events in user mode\n");
535 printf(" -y|--hyper Count events in hypervisor mode\n");
536 printf(" -i|--idle Do also count when idle\n");
537 printf(" -l|--list List possible events\n");
538 printf(" -x|--use <event> Count only a certain event\n");
539 printf(" -v|--version Print version\n");
540 printf(" -h|--help Print this help\n");
541 printf("\n");
542 printf("Please report bugs to <dborkma@tik.ee.ethz.ch>\n");
543 printf("Copyright (C) 2011 Daniel Borkmann\n");
544 printf("License: GNU GPL version 2\n");
545 printf("This is free software: you are free to change and redistribute it.\n");
546 printf("There is NO WARRANTY, to the extent permitted by law.\n\n");
548 die();
551 static void version(void)
553 printf("\n%s %s\n", PROGNAME, VERSNAME);
554 printf("Please report bugs to <dborkma@tik.ee.ethz.ch>\n");
555 printf("Copyright (C) 2011 Daniel Borkmann\n");
556 printf("License: GNU GPL version 2\n");
557 printf("This is free software: you are free to change and redistribute it.\n");
558 printf("There is NO WARRANTY, to the extent permitted by law.\n\n");
560 die();
/* SIGCHLD handler: reaps all exited children without blocking */
static void reaper(int sig)
{
	int pid, status;

	while ((pid = waitpid(-1, &status, WNOHANG)) > 0)
		;
}
/* SIGINT handler: only records that an interrupt arrived */
static void intr(int sig)
{
	sigint = 1;
}
/*
 * Installs 'handler' for 'signal' via sigaction, blocking all other
 * signals while the handler runs and restarting interrupted syscalls.
 */
static inline void register_signal(int signal, void (*handler)(int))
{
	sigset_t block_mask;
	struct sigaction saction;

	sigfillset(&block_mask);
	saction.sa_handler = handler;
	saction.sa_mask = block_mask;
	saction.sa_flags = SA_RESTART;

	sigaction(signal, &saction, NULL);
}
/* Returns the monotonic clock in nanoseconds (used for wall time) */
static inline unsigned long long rdclock(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}
/* Prints a printf-style message to stderr and terminates the process */
static inline void panic(char *msg, ...)
{
	va_list vl;
	va_start(vl, msg);
	vfprintf(stderr, msg, vl);
	va_end(vl);

	die();
}
/* Prints a printf-style warning to stderr; unlike panic() it returns */
static inline void whine(char *msg, ...)
{
	va_list vl;
	va_start(vl, msg);
	vfprintf(stderr, msg, vl);
	va_end(vl);
}
615 static void *xzmalloc(size_t size)
617 void *ptr;
619 if (unlikely(size == 0))
620 panic("xzmalloc: zero size\n");
622 ptr = malloc(size);
623 if (unlikely(ptr == NULL))
624 panic("xzmalloc: out of memory (allocating %lu bytes)\n",
625 (u_long) size);
626 memset(ptr, 0, size);
628 return ptr;
/* Frees memory from xzmalloc(); deliberately panics on NULL to catch bugs */
static void xfree(void *ptr)
{
	if (unlikely(ptr == NULL))
		panic("xfree: NULL pointer given as argument\n");
	free(ptr);
}
/*
 * The 'group_fd' parameter allows counter "groups" to be set up. A
 * counter group has one counter which is the group "leader". The leader
 * is created first, with group_fd = -1 in the perf_event_open call
 * that creates it. The rest of the group members are created
 * subsequently, with group_fd giving the fd of the group leader.
 * (A single counter on its own is created with group_fd = -1 and is
 * considered to be a group with only 1 member.)
 *
 * A counter group is scheduled onto the CPU as a unit, that is, it will
 * only be put onto the CPU if all of the counters in the group can be
 * put onto the CPU. This means that the values of the member counters
 * can be meaningfully compared, added, divided (to get ratios), etc.,
 * with each other, since they have counted events for the same set of
 * executed instructions.
 */
static inline int sys_perf_event_open(struct perf_event_attr *attr, pid_t pid,
				      int cpu, int group_fd,
				      unsigned long flags)
{
	/*
	 * PID settings:
	 * pid == 0: counter attached to current task
	 * pid > 0: counter attached to specific task
	 * pid < 0: counter attached to all tasks
	 * CPU settings:
	 * cpu >= 0: counter restricted to a specific CPU
	 * cpu == -1: counter counts on all CPUs
	 * User/kernel/hypervisor modes:
	 * See attr bits for excluding stuff!
	 * Note: pid == -1 && cpu == -1 is invalid!
	 * flags must be 0!
	 */
	/* attr.size carries the ABI compat contract for the kernel */
	attr->size = sizeof(*attr);
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}
/*
 * Returns the caller's kernel thread id via syscall, since glibc of
 * this era exposes no gettid() wrapper. (void) makes it a real
 * prototype instead of a K&R-style unspecified parameter list.
 */
static inline pid_t gettid(void)
{
	return syscall(SYS_gettid);
}
/*
 * Allocates a perf_data session and opens one perf counter per entry of
 * default_attrs for (pid, cpu). fds[0] becomes the group leader; the
 * remaining counters join its group. All counters start disabled and
 * must be switched on via enable_counter()/enable_all_counter().
 * Panics if any perf_event_open call fails. Caller frees with cleanup().
 */
static struct perf_data *initialize(pid_t pid, int cpu, int mode, int excl)
{
	int i;
	struct perf_data *pd;
	struct perf_event_attr *attr;
	struct perf_event_attr *attrs;

	pd = xzmalloc(sizeof(*pd));
	/* pid < 0 means "trace myself" here, not "all tasks" */
	if (pid < 0)
		pid = gettid();
	pd->group = GRP_INVALID;
	for (i = 0; i < ARRAY_SIZE(pd->fds); i++)
		pd->fds[i] = FDS_INVALID;
	pd->pid = pid;
	pd->cpu = cpu;

	/* Private, writable copy of the attr table for this session */
	attrs = xzmalloc(sizeof(*attrs) * ARRAY_SIZE(default_attrs));
	memcpy(attrs, default_attrs, sizeof(default_attrs));
	pd->attrs = attrs;

	for (i = 0; i < ARRAY_SIZE(default_attrs); i++) {
		attr = &attrs[i];
		attr->inherit = 1;
		attr->disabled = 1;	/* armed later by the enable ioctl */
		attr->enable_on_exec = 0;
		attr->exclusive = excl;
		/* mode bits select what NOT to exclude */
		attr->exclude_user = ((mode & MODE_USER) == 0);
		attr->exclude_kernel = ((mode & MODE_KERNEL) == 0);
		attr->exclude_hv = ((mode & MODE_HYPER) == 0);
		attr->exclude_idle = ((mode & MODE_IDLE) == 0);
		/* pd->fds[0] is counter group leader! */
		pd->fds[i] = sys_perf_event_open(attr, pid, cpu,
						 i == 0 ? GRP_INVALID : pd->fds[0],
						 PERF_IOC_FLAG_GROUP);
		if (unlikely(pd->fds[i] < 0))
			panic("sys_perf_event_open failed: %s\n", strerror(errno));
	}

	pd->group = pd->fds[0];
	return pd;
}
724 * A read() on a counter returns the current value of the counter and possible
725 * additional values as specified by 'read_format', each value is a u64 (8 bytes)
726 * in size.
728 static uint64_t read_counter(struct perf_data *pd, int counter)
730 int ret;
731 uint64_t value;
733 if (counter == INTERNAL_SW_WALL_TIME)
734 return (uint64_t) (rdclock() - pd->wall_start);
735 if (unlikely(counter < 0 || counter > MAX_COUNTERS))
736 panic("bug! invalid counter value!\n");
738 ret = read(pd->fds[counter], &value, sizeof(uint64_t));
739 if (unlikely(ret != sizeof(uint64_t)))
740 panic("perf_counter read error!\n");
742 return value;
/*
 * Counters can be enabled and disabled in two ways: via ioctl and via
 * prctl. When a counter is disabled, it doesn't count or generate
 * events but does continue to exist and maintain its count value.
 *
 * Enabling or disabling the leader of a group enables or disables the
 * whole group; that is, while the group leader is disabled, none of the
 * counters in the group will count. Enabling or disabling a member of a
 * group other than the leader only affects that counter - disabling an
 * non-leader stops that counter from counting but doesn't affect any
 * other counter.
 */
static void enable_counter(struct perf_data *pd, int counter)
{
	int ret;

	if (unlikely(counter < 0 || counter >= MAX_COUNTERS))
		panic("bug! invalid counter value!\n");
	if (pd->fds[counter] == FDS_INVALID) {
		/* Counter not opened yet: open it lazily into the group */
		pd->fds[counter] = sys_perf_event_open(&pd->attrs[counter],
						       pd->pid, pd->cpu,
						       pd->group,
						       PERF_IOC_FLAG_GROUP);
		if (unlikely(pd->fds[counter] < 0))
			panic("sys_perf_event_open failed!\n");
	}

	ret = ioctl(pd->fds[counter], PERF_EVENT_IOC_ENABLE);
	if (ret)
		panic("error enabling perf counter!\n");

	/* NOTE(review): resets the wall clock on every call, so the last
	 * enabled counter defines wall_start for the whole session */
	pd->wall_start = rdclock();
}
779 static void enable_all_counter(struct perf_data *pd)
781 int ret, i;
783 for (i = 0; i < MAX_COUNTERS; i++) {
784 enable_counter(pd, i);
787 /* XXX: Only group leader? */
788 #if 0
789 for (i = 0; i < MAX_COUNTERS; i++) {
790 /* ret = ioctl(pd->group, PERF_EVENT_IOC_ENABLE); */
791 ret = ioctl(pd->fds[i], PERF_EVENT_IOC_ENABLE);
792 if (ret)
793 panic("error enabling perf counter!\n");
795 #endif
796 pd->wall_start = rdclock();
/*
 * Disables a single counter via ioctl; the counter keeps its value and
 * can be read afterwards. Silently ignores counters never opened.
 */
static void disable_counter(struct perf_data *pd, int counter)
{
	int ret;

	if (unlikely(counter < 0 || counter >= MAX_COUNTERS))
		panic("bug! invalid counter value!\n");
	if (pd->fds[counter] == FDS_INVALID)
		return;

	ret = ioctl(pd->fds[counter], PERF_EVENT_IOC_DISABLE);
	if (ret)
		panic("error disabling perf counter!\n");
}
813 static void disable_all_counter(struct perf_data *pd)
815 int ret, i;
817 /* XXX: Only group leader? */
818 for (i = 0; i < MAX_COUNTERS; i++) {
819 disable_counter(pd, i);
/* Closes all opened counter fds and releases the session memory */
static void cleanup(struct perf_data *pd)
{
	int i;

	/* only close slots that were actually opened */
	for (i = 0; i < ARRAY_SIZE(default_attrs); i++)
		if (pd->fds[i] >= 0)
			close(pd->fds[i]);
	xfree(pd->attrs);
	xfree(pd);
}
/* Prints all selectable counter names (for -l/--list) and exits */
static void list_counter(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(whole_map); i++)
		printf("%s\n", whole_map[i].name);

	die();
}
844 static enum tracepoint lookup_counter(char *name)
846 int i;
848 for (i = 0; i < ARRAY_SIZE(whole_map); i++)
849 if (!strncmp(whole_map[i].name, name, sizeof(whole_map[i].name) - 1))
850 return whole_map[i].tracepoint;
851 return INTERNAL_INVALID_TP;
854 static void print_whole_result(struct perf_data *pd)
856 uint64_t tmp1, tmp2;
858 printf("Software counters:\n");
859 printf(" CPU clock ticks %" PRIu64 "\n", read_counter(pd, COUNT_SW_CPU_CLOCK));
860 printf(" task clock ticks %" PRIu64 "\n", read_counter(pd, COUNT_SW_TASK_CLOCK));
861 printf(" CPU context switches %" PRIu64 "\n", read_counter(pd, COUNT_SW_CONTEXT_SWITCHES));
862 printf(" CPU migrations %" PRIu64 "\n", read_counter(pd, COUNT_SW_CPU_MIGRATIONS));
863 printf(" pagefaults/minor/major %" PRIu64 "/%" PRIu64 "/%" PRIu64 "\n",
864 read_counter(pd, COUNT_SW_PAGE_FAULTS),
865 read_counter(pd, COUNT_SW_PAGE_FAULTS_MIN),
866 read_counter(pd, COUNT_SW_PAGE_FAULTS_MAJ));
867 printf("Hardware counters:\n");
868 printf(" CPU cycles %" PRIu64 "\n", read_counter(pd, COUNT_HW_CPU_CYCLES));
869 printf(" instructions %" PRIu64 "\n", read_counter(pd, COUNT_HW_INSTRUCTIONS));
870 tmp1 = read_counter(pd, COUNT_HW_CACHE_REFERENCES);
871 tmp2 = read_counter(pd, COUNT_HW_CACHE_MISSES);
872 printf(" cache references %" PRIu64 "\n", tmp1);
873 printf(" cache misses (rate) %" PRIu64 " (%.4lf %%)\n", tmp2, (1.0 * tmp2 / tmp1) * 100.0);
874 tmp1 = read_counter(pd, COUNT_HW_BRANCH_INSTRUCTIONS);
875 tmp2 = read_counter(pd, COUNT_HW_BRANCH_MISSES);
876 printf(" branch instructions %" PRIu64 "\n", tmp1);
877 printf(" branch misses (rate) %" PRIu64 " (%.4lf %%)\n", tmp2, (1.0 * tmp2 / tmp1) * 100.0);
878 printf(" bus cycles %" PRIu64 "\n", read_counter(pd, COUNT_HW_BUS_CYCLES));
879 printf("L1D, data cache:\n");
880 printf(" loads %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_L1D_LOADS));
881 printf(" load misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_L1D_LOADS_MISSES));
882 printf(" stores %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_L1D_STORES));
883 printf(" store misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_L1D_STORES_MISSES));
884 printf(" prefetches %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_L1D_PREFETCHES));
885 printf("L1I, instruction cache:\n");
886 printf(" loads %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_L1I_LOADS));
887 printf(" load misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_L1I_LOADS_MISSES));
888 printf("LL, last level cache:\n");
889 printf(" loads %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_LL_LOADS));
890 printf(" load misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_LL_LOADS_MISSES));
891 printf(" stores %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_LL_STORES));
892 printf(" store misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_LL_STORES_MISSES));
893 printf("DTLB, data translation lookaside buffer:\n");
894 printf(" loads %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_DTLB_LOADS));
895 printf(" load misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_DTLB_LOADS_MISSES));
896 printf(" stores %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_DTLB_STORES));
897 printf(" store misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_DTLB_STORES_MISSES));
898 printf("ILLB, instruction translation lookaside buffer:\n");
899 printf(" loads %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_ITLB_LOADS));
900 printf(" load misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_ITLB_LOADS_MISSES));
901 printf("BPU, branch prediction unit:\n");
902 printf(" loads %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_BPU_LOADS));
903 printf(" load misses %" PRIu64 "\n", read_counter(pd, COUNT_HW_CACHE_BPU_LOADS_MISSES));
904 printf("Wall-clock time elapsed:\n");
905 printf(" usec %" PRIu64 "\n", read_counter(pd, INTERNAL_SW_WALL_TIME));
908 int main(int argc, char **argv)
910 int status, c, opt_index, mode, pt, cpu, excl, ret;
911 unsigned long cpus;
912 pid_t pid = -1;
913 struct perf_data *pd;
914 enum tracepoint tp;
916 if (argc == 1)
917 usage();
919 cpus = sysconf(_SC_NPROCESSORS_ONLN);
920 cpu = -1;
921 mode = excl = pt = 0;
922 tp = INTERNAL_INVALID_TP;
924 while ((c = getopt_long(argc, argv, short_options, long_options,
925 &opt_index)) != EOF) {
926 switch (c) {
927 case 'h':
928 usage();
929 break;
930 case 'v':
931 version();
932 break;
933 case 'p':
934 pid = atoi(optarg);
935 if (pid < 0)
936 panic("bad pid! either 0 for all procs "
937 "or x > 0!\n");
938 if (pid == 0)
939 pid = -1;
940 else
941 pt = 1;
942 whine("not yet working correctly!\n");
943 break;
944 case 'c':
945 cpu = atoi(optarg);
946 if (cpu < 0 || cpu >= cpus)
947 panic("bad cpuid! needs to be 0 <= x < "
948 "%lu!\n", cpus);
949 break;
950 case 'e':
951 excl = 1;
952 break;
953 case 'k':
954 mode |= MODE_KERNEL;
955 break;
956 case 'u':
957 mode |= MODE_USER;
958 break;
959 case 'y':
960 mode |= MODE_HYPER;
961 break;
962 case 'i':
963 mode |= MODE_IDLE;
964 break;
965 case 'l':
966 list_counter();
967 break;
968 case 'x':
969 tp = lookup_counter(optarg);
970 printf("found: %d\n", tp);
971 break;
972 default:
973 usage();
974 break;
978 if (pt && pid == -1 && cpu == -1)
979 panic("either all procs on a single core or all cpus on a "
980 "single proc, but not both!\n");
982 if (mode == 0)
983 mode = MODE_KERNEL | MODE_USER | MODE_HYPER;
985 if (!pt)
986 pid = fork();
987 else {
988 register_signal(SIGCHLD, reaper);
989 ret = ptrace(PT_ATTACH, pid, (char *) 1, 0);
990 if (ret < 0) {
991 panic("cannot attach to process!\n");
992 perror("");
994 fprintf(stderr, "Process %u attached - interrupt to quit\n",
995 pid);
998 pd = initialize(pid, cpu, mode, excl);
999 if (tp == INTERNAL_INVALID_TP)
1000 enable_all_counter(pd);
1001 else
1002 enable_counter(pd, tp);
1004 if (!pt && !pid) {
1005 execvp(argv[optind], &argv[optind]);
1006 die();
1008 register_signal(SIGINT, intr);
1009 wait(&status);
1010 if (tp == INTERNAL_INVALID_TP)
1011 disable_all_counter(pd);
1012 else
1013 disable_counter(pd, tp);
1015 if (pt) {
1016 ret = ptrace(PT_DETACH, pid, (char *) 1, SIGCONT);
1017 if (ret < 0) {
1018 panic("cannot detach from process!\n");
1019 perror("");
1021 fprintf(stderr, "Process %u detached\n", pid);
1024 if (cpu == -1)
1025 printf("CPU: all, PID: %d\n", pid);
1026 else
1027 printf("CPU: %d, PID: %d\n", cpu, pid);
1028 printf("Kernel: %s, User: %s, Hypervisor: %s\n",
1029 (mode & MODE_KERNEL) == MODE_KERNEL ? "on" : "off",
1030 (mode & MODE_USER) == MODE_USER ? "on" : "off",
1031 (mode & MODE_HYPER) == MODE_HYPER ? "on" : "off");
1033 if (tp == INTERNAL_INVALID_TP)
1034 print_whole_result(pd);
1035 else
1036 printf("%" PRIu64 " in %" PRIu64 " usec\n",
1037 read_counter(pd, tp),
1038 read_counter(pd, INTERNAL_SW_WALL_TIME));
1039 cleanup(pd);
1041 return 0;