perf_counter: kerneltop: parse the mmap data stream
Documentation/perf_counter/kerneltop.c (linux-2.6/verdex.git)
/*
 * kerneltop.c: show top kernel functions - performance counters showcase

   Build with:

     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt

   Sample output:

------------------------------------------------------------------------------
 KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
------------------------------------------------------------------------------

             weight         RIP          kernel function
             ______   ________________   _______________

              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
              33.00 - ffffffff804cb740 : sock_alloc_send_skb
              31.26 - ffffffff804ce808 : skb_push
              22.43 - ffffffff80510004 : tcp_established_options
              19.00 - ffffffff8027d250 : find_get_page
              15.76 - ffffffff804e4fc9 : eth_type_trans
              15.20 - ffffffff804d8baa : dst_release
              14.86 - ffffffff804cf5d8 : skb_release_head_state
              14.00 - ffffffff802217d5 : read_hpet
              12.00 - ffffffff804ffb7f : __ip_local_out
              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
               8.54 - ffffffff805001a3 : ip_queue_xmit

 * perfstat:  /usr/bin/time -alike performance counter statistics utility

          It summarizes the counter events of all tasks (and child tasks),
          covering all CPUs that the command (or workload) executes on.
          It only counts the per-task events of the workload started,
          independent of how many other tasks run on those CPUs.

          Sample output:

          $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null

          Performance counter stats for 'ls':

               163516953 instructions
                    2295 cache-misses
                 2855182 branch-misses

 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
 *
 * Improvements and fixes by:
 *
 *   Arjan van de Ven <arjan@linux.intel.com>
 *   Yanmin Zhang <yanmin.zhang@intel.com>
 *   Wu Fengguang <fengguang.wu@intel.com>
 *   Mike Galbraith <efault@gmx.de>
 *   Paul Mackerras <paulus@samba.org>
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <getopt.h>
#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <ctype.h>
#include <time.h>

#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sys/uio.h>
#include <sys/mman.h>

#include <linux/unistd.h>
#include <linux/types.h>

#include "../../include/linux/perf_counter.h"

/*
 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
 * counters in the current task.
 */
#define PR_TASK_PERF_COUNTERS_DISABLE	31
#define PR_TASK_PERF_COUNTERS_ENABLE	32
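/*
 * Note: do_perfstat() below flips these around the measured workload:
 * counters are enabled just before the fork()+exec() and disabled again
 * once the child has exited.
 */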
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

#define rdclock()					\
({							\
	struct timespec ts;				\
							\
	clock_gettime(CLOCK_MONOTONIC, &ts);		\
	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
})

/*
 * Pick up some kernel type conventions:
 */
#define __user
#define asmlinkage

#ifdef __x86_64__
#define __NR_perf_counter_open	295
#define rmb()		asm volatile("lfence" ::: "memory")
#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
#endif

#ifdef __i386__
#define __NR_perf_counter_open	333
#define rmb()		asm volatile("lfence" ::: "memory")
#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
#endif

#ifdef __powerpc__
#define __NR_perf_counter_open	319
#define rmb()		asm volatile ("sync" ::: "memory")
#define cpu_relax()	asm volatile ("" ::: "memory");
#endif

#define unlikely(x)	__builtin_expect(!!(x), 0)
#define min(x, y) ({				\
	typeof(x) _min1 = (x);			\
	typeof(y) _min2 = (y);			\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })
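/*
 * The perf_counter syscall is too new to have a libc wrapper, so it is
 * invoked directly via syscall() with the per-arch __NR_perf_counter_open
 * numbers defined above.
 */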
asmlinkage int sys_perf_counter_open(
	struct perf_counter_hw_event	*hw_event_uptr	__user,
	pid_t				pid,
	int				cpu,
	int				group_fd,
	unsigned long			flags)
{
	return syscall(
		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
}

#define MAX_COUNTERS	64
#define MAX_NR_CPUS	256
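/*
 * EID() packs a counter type and an event id into a single ->config
 * value; e.g. EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK) selects the
 * software task-clock counter in the default list below.
 */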
#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))

static int		run_perfstat = 0;
static int		system_wide = 0;

static int		nr_counters = 0;
static __u64		event_id[MAX_COUNTERS] = {
	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),

	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
};
static int		default_interval = 100000;
static int		event_count[MAX_COUNTERS];
static int		fd[MAX_NR_CPUS][MAX_COUNTERS];

static __u64		count_filter = 100;

static int		tid = -1;
static int		profile_cpu = -1;
static int		nr_cpus = 0;
static int		nmi = 1;
static int		group = 0;
static unsigned int	page_size;
static unsigned int	mmap_pages = 16;
static int		use_mmap = 0;
static int		use_munmap = 0;

static char		*vmlinux;

static char		*sym_filter;
static unsigned long	filter_start;
static unsigned long	filter_end;

static int		delay_secs = 2;
static int		zero;
static int		dump_symtab;

struct source_line {
	uint64_t		EIP;
	unsigned long		count;
	char			*line;
	struct source_line	*next;
};

static struct source_line	*lines;
static struct source_line	**lines_tail;

const unsigned int default_count[] = {
	1000000,
	1000000,
	  10000,
	  10000,
	1000000,
	  10000,
};

static char *hw_event_names[] = {
	"CPU cycles",
	"instructions",
	"cache references",
	"cache misses",
	"branches",
	"branch misses",
	"bus cycles",
};

static char *sw_event_names[] = {
	"cpu clock ticks",
	"task clock ticks",
	"pagefaults",
	"context switches",
	"CPU migrations",
	"minor faults",
	"major faults",
};

struct event_symbol {
	__u64 event;
	char *symbol;
};

static struct event_symbol event_symbols[] = {
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},

	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
};

#define __PERF_COUNTER_FIELD(config, name) \
	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)

#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
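/*
 * These macros undo the EID() packing again, e.g.
 * PERF_COUNTER_TYPE(EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES))
 * yields PERF_TYPE_HARDWARE and PERF_COUNTER_ID() yields the raw id.
 */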
static void display_events_help(void)
{
	unsigned int i;
	__u64 e;

	printf(
	" -e EVENT     --event=EVENT          #  symbolic-name        abbreviations");

	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
		int type, id;

		e = event_symbols[i].event;
		type = PERF_COUNTER_TYPE(e);
		id = PERF_COUNTER_ID(e);

		printf("\n                             %d:%d: %-20s",
				type, id, event_symbols[i].symbol);
	}

	printf("\n"
	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
}

static void display_perfstat_help(void)
{
	printf(
	"Usage: perfstat [<events...>] <cmd...>\n\n"
	"PerfStat Options (up to %d event types can be specified):\n\n",
		 MAX_COUNTERS);

	display_events_help();

	printf(
	" -a                           # system-wide collection\n");
	exit(0);
}

static void display_help(void)
{
	if (run_perfstat)
		return display_perfstat_help();

	printf(
	"Usage: kerneltop [<options>]\n"
	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
	"KernelTop Options (up to %d event types can be specified at once):\n\n",
		 MAX_COUNTERS);

	display_events_help();

	printf(
	" -S        --stat             # perfstat COMMAND\n"
	" -a                           # system-wide collection (for perfstat)\n\n"
	" -c CNT    --count=CNT        # event period to sample\n\n"
	" -C CPU    --cpu=CPU          # CPU (-1 for all) [default: -1]\n"
	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
	" -d delay  --delay=<seconds>  # sampling/display delay   [default:  2]\n"
	" -f CNT    --filter=CNT       # min-event-count filter   [default: 100]\n\n"
	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
	" -z        --zero             # zero counts after display\n"
	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
	" -M        --mmap_info        # print mmap info stream\n"
	" -U        --munmap_info      # print munmap info stream\n"
	);

	exit(0);
}

static char *event_name(int ctr)
{
	__u64 config = event_id[ctr];
	int type = PERF_COUNTER_TYPE(config);
	int id = PERF_COUNTER_ID(config);
	static char buf[32];

	if (PERF_COUNTER_RAW(config)) {
		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
		return buf;
	}

	switch (type) {
	case PERF_TYPE_HARDWARE:
		if (id < PERF_HW_EVENTS_MAX)
			return hw_event_names[id];
		return "unknown-hardware";

	case PERF_TYPE_SOFTWARE:
		if (id < PERF_SW_EVENTS_MAX)
			return sw_event_names[id];
		return "unknown-software";

	default:
		break;
	}

	return "unknown";
}

/*
 * Each event can have multiple symbolic names.
 * Symbolic names are (almost) exactly matched.
 */
static __u64 match_event_symbols(char *str)
{
	__u64 config, id;
	int type;
	unsigned int i;

	if (sscanf(str, "r%llx", &config) == 1)
		return config | PERF_COUNTER_RAW_MASK;

	if (sscanf(str, "%d:%llu", &type, &id) == 2)
		return EID(type, id);

	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
		if (!strncmp(str, event_symbols[i].symbol,
			     strlen(event_symbols[i].symbol)))
			return event_symbols[i].event;
	}

	return ~0ULL;
}
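/*
 * Note: -e accepts a comma-separated list (e.g. "-e cycles,instructions"),
 * raw events as rNNN and numeric "type:id" pairs, on top of the symbolic
 * names above.
 */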
static int parse_events(char *str)
{
	__u64 config;

again:
	if (nr_counters == MAX_COUNTERS)
		return -1;

	config = match_event_symbols(str);
	if (config == ~0ULL)
		return -1;

	event_id[nr_counters] = config;
	nr_counters++;

	str = strstr(str, ",");
	if (str) {
		str++;
		goto again;
	}

	return 0;
}
/*
 * perfstat
 */

char fault_here[1000000];

static void create_perfstat_counter(int counter)
{
	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.config		= event_id[counter];
	hw_event.record_type	= PERF_RECORD_SIMPLE;
	hw_event.nmi		= 0;

	if (system_wide) {
		int cpu;
		for (cpu = 0; cpu < nr_cpus; cpu ++) {
			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
			if (fd[cpu][counter] < 0) {
				printf("perfstat error: syscall returned with %d (%s)\n",
						fd[cpu][counter], strerror(errno));
				exit(-1);
			}
		}
	} else {
		hw_event.inherit	= 1;
		hw_event.disabled	= 1;

		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
		if (fd[0][counter] < 0) {
			printf("perfstat error: syscall returned with %d (%s)\n",
					fd[0][counter], strerror(errno));
			exit(-1);
		}
	}
}

int do_perfstat(int argc, char *argv[])
{
	unsigned long long t0, t1;
	int counter;
	ssize_t res;
	int status;
	int pid;

	if (!system_wide)
		nr_cpus = 1;

	for (counter = 0; counter < nr_counters; counter++)
		create_perfstat_counter(counter);

	argc -= optind;
	argv += optind;

	if (!argc)
		display_help();

	/*
	 * Enable counters and exec the command:
	 */
	t0 = rdclock();
	prctl(PR_TASK_PERF_COUNTERS_ENABLE);

	if ((pid = fork()) < 0)
		perror("failed to fork");
	if (!pid) {
		if (execvp(argv[0], argv)) {
			perror(argv[0]);
			exit(-1);
		}
	}
	while (wait(&status) >= 0)
		;
	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
	t1 = rdclock();

	fflush(stdout);

	fprintf(stderr, "\n");
	fprintf(stderr, " Performance counter stats for \'%s\':\n",
		argv[0]);
	fprintf(stderr, "\n");

	for (counter = 0; counter < nr_counters; counter++) {
		int cpu;
		__u64 count, single_count;

		count = 0;
		for (cpu = 0; cpu < nr_cpus; cpu ++) {
			res = read(fd[cpu][counter],
					(char *) &single_count, sizeof(single_count));
			assert(res == sizeof(single_count));
			count += single_count;
		}

		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {

			double msecs = (double)count / 1000000;

			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
				msecs, event_name(counter));
		} else {
			fprintf(stderr, " %14Ld  %-20s (events)\n",
				count, event_name(counter));
		}
	}
	fprintf(stderr, "\n");
	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
			(double)(t1-t0)/1e6);
	fprintf(stderr, "\n");

	return 0;
}
/*
 * Symbols
 */

static uint64_t		min_ip;
static uint64_t		max_ip = -1ll;

struct sym_entry {
	unsigned long long	addr;
	char			*sym;
	unsigned long		count[MAX_COUNTERS];
	int			skip;
	struct source_line	*source;
};

#define MAX_SYMS	100000

static int sym_table_count;

struct sym_entry	*sym_filter_entry;

static struct sym_entry	sym_table[MAX_SYMS];

static void show_details(struct sym_entry *sym);

/*
 * Ordering weight: count-1 * count-2 * ... / count-n
 */
static double sym_weight(const struct sym_entry *sym)
{
	double weight;
	int counter;

	weight = sym->count[0];

	for (counter = 1; counter < nr_counters-1; counter++)
		weight *= sym->count[counter];

	weight /= (sym->count[counter] + 1);

	return weight;
}
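/*
 * E.g. with two counters the weight reduces to count[0] / (count[1] + 1);
 * the "+ 1" keeps the division defined when the last counter is zero.
 */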
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;

	return sym_weight(sym1) < sym_weight(sym2);
}

static time_t		last_refresh;
static long		events;
static long		userspace_events;
static const char	CONSOLE_CLEAR[] = "\e[H\e[2J";

static struct sym_entry tmp[MAX_SYMS];

static void print_sym_table(void)
{
	int i, printed;
	int counter;
	float events_per_sec = events/delay_secs;
	float kevents_per_sec = (events-userspace_events)/delay_secs;

	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);

	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));

	printf(
"------------------------------------------------------------------------------\n");
	printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
		events_per_sec,
		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
		nmi ? "NMI" : "IRQ");

	if (nr_counters == 1)
		printf("%d ", event_count[0]);

	for (counter = 0; counter < nr_counters; counter++) {
		if (counter)
			printf("/");

		printf("%s", event_name(counter));
	}

	printf( "], ");

	if (tid != -1)
		printf(" (tid: %d", tid);
	else
		printf(" (all");

	if (profile_cpu != -1)
		printf(", cpu: %d)\n", profile_cpu);
	else {
		if (tid != -1)
			printf(")\n");
		else
			printf(", %d CPUs)\n", nr_cpus);
	}

	printf("------------------------------------------------------------------------------\n\n");

	if (nr_counters == 1)
		printf("             events");
	else
		printf("  weight     events");

	printf("         RIP          kernel function\n"
	       "  ______     ______   ________________   _______________\n\n"
	);

	printed = 0;
	for (i = 0; i < sym_table_count; i++) {
		int count;

		if (nr_counters == 1) {
			if (printed <= 18 &&
					tmp[i].count[0] >= count_filter) {
				printf("%19.2f - %016llx : %s\n",
				  sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
				printed++;
			}
		} else {
			if (printed <= 18 &&
					tmp[i].count[0] >= count_filter) {
				printf("%8.1f %10ld - %016llx : %s\n",
				  sym_weight(tmp + i),
				  tmp[i].count[0],
				  tmp[i].addr, tmp[i].sym);
				printed++;
			}
		}
		/*
		 * Add decay to the counts:
		 */
		for (count = 0; count < nr_counters; count++)
			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
	}

	if (sym_filter_entry)
		show_details(sym_filter_entry);

	last_refresh = time(NULL);

	{
		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };

		if (poll(&stdin_poll, 1, 0) == 1) {
			printf("key pressed - exiting.\n");
			exit(0);
		}
	}
}
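/*
 * /proc/kallsyms lines have the form "<address> <type> <symbol> [module]",
 * e.g. "ffffffff804ce74b T skb_copy_and_csum_dev"; read_symbol() parses
 * one such line into a sym_entry.
 */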
static int read_symbol(FILE *in, struct sym_entry *s)
{
	static int filter_match = 0;
	char *sym, stype;
	char str[500];
	int rc, pos;

	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
	if (rc == EOF)
		return -1;

	assert(rc == 3);

	/* skip until end of line: */
	pos = strlen(str);
	do {
		rc = fgetc(in);
		if (rc == '\n' || rc == EOF || pos >= 499)
			break;
		str[pos] = rc;
		pos++;
	} while (1);
	str[pos] = 0;

	sym = str;

	/* Filter out known duplicates and non-text symbols. */
	if (!strcmp(sym, "_text"))
		return 1;
	if (!min_ip && !strcmp(sym, "_stext"))
		return 1;
	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
		return 1;
	if (stype != 'T' && stype != 't')
		return 1;
	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
		return 1;
	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
		return 1;

	s->sym = malloc(strlen(str) + 1);
	assert(s->sym);

	strcpy((char *)s->sym, str);
	s->skip = 0;

	/* Tag events to be skipped. */
	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
		s->skip = 1;
	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
		s->skip = 1;
	else if (!strcmp("mwait_idle", s->sym))
		s->skip = 1;

	if (filter_match == 1) {
		filter_end = s->addr;
		filter_match = -1;
		if (filter_end - filter_start > 10000) {
			printf("hm, too large filter symbol <%s> - skipping.\n",
				sym_filter);
			printf("symbol filter start: %016lx\n", filter_start);
			printf("                end: %016lx\n", filter_end);
			filter_end = filter_start = 0;
			sym_filter = NULL;
			sleep(1);
		}
	}
	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
		filter_match = 1;
		filter_start = s->addr;
	}

	return 0;
}
int compare_addr(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;

	return sym1->addr > sym2->addr;
}

static void sort_symbol_table(void)
{
	int i, dups;

	do {
		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
		for (i = 0, dups = 0; i < sym_table_count; i++) {
			if (sym_table[i].addr == sym_table[i+1].addr) {
				sym_table[i+1].addr = -1ll;
				dups++;
			}
		}
		sym_table_count -= dups;
	} while(dups);
}

static void parse_symbols(void)
{
	struct sym_entry *last;

	FILE *kallsyms = fopen("/proc/kallsyms", "r");

	if (!kallsyms) {
		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
		exit(-1);
	}

	while (!feof(kallsyms)) {
		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
			sym_table_count++;
			assert(sym_table_count <= MAX_SYMS);
		}
	}

	sort_symbol_table();
	min_ip = sym_table[0].addr;
	max_ip = sym_table[sym_table_count-1].addr;
	last = sym_table + sym_table_count++;

	last->addr = -1ll;
	last->sym = "<end>";

	if (filter_end) {
		int count;
		for (count=0; count < sym_table_count; count ++) {
			if (!strcmp(sym_table[count].sym, sym_filter)) {
				sym_filter_entry = &sym_table[count];
				break;
			}
		}
	}
	if (dump_symtab) {
		int i;

		for (i = 0; i < sym_table_count; i++)
			fprintf(stderr, "%llx %s\n",
				sym_table[i].addr, sym_table[i].sym);
	}
}
/*
 * Source lines
 */

static void parse_vmlinux(char *filename)
{
	FILE *file;
	char command[PATH_MAX*2];
	if (!filename)
		return;

	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);

	file = popen(command, "r");
	if (!file)
		return;

	lines_tail = &lines;
	while (!feof(file)) {
		struct source_line *src;
		size_t dummy = 0;
		char *c;

		src = malloc(sizeof(struct source_line));
		assert(src != NULL);
		memset(src, 0, sizeof(struct source_line));

		if (getline(&src->line, &dummy, file) < 0)
			break;
		if (!src->line)
			break;

		c = strchr(src->line, '\n');
		if (c)
			*c = 0;

		src->next = NULL;
		*lines_tail = src;
		lines_tail = &src->next;

		if (strlen(src->line)>8 && src->line[8] == ':')
			src->EIP = strtoull(src->line, NULL, 16);
		if (strlen(src->line)>16 && src->line[16] == ':')
			src->EIP = strtoull(src->line, NULL, 16);
	}
	pclose(file);
}
static void record_precise_ip(uint64_t ip)
{
	struct source_line *line;

	for (line = lines; line; line = line->next) {
		if (line->EIP == ip)
			line->count++;
		if (line->EIP > ip)
			break;
	}
}

static void lookup_sym_in_vmlinux(struct sym_entry *sym)
{
	struct source_line *line;
	char pattern[PATH_MAX];
	sprintf(pattern, "<%s>:", sym->sym);

	for (line = lines; line; line = line->next) {
		if (strstr(line->line, pattern)) {
			sym->source = line;
			break;
		}
	}
}

static void show_lines(struct source_line *line_queue, int line_queue_count)
{
	int i;
	struct source_line *line;

	line = line_queue;
	for (i = 0; i < line_queue_count; i++) {
		printf("%8li\t%s\n", line->count, line->line);
		line = line->next;
	}
}

#define TRACE_COUNT     3

static void show_details(struct sym_entry *sym)
{
	struct source_line *line;
	struct source_line *line_queue = NULL;
	int displayed = 0;
	int line_queue_count = 0;

	if (!sym->source)
		lookup_sym_in_vmlinux(sym);
	if (!sym->source)
		return;

	printf("Showing details for %s\n", sym->sym);

	line = sym->source;
	while (line) {
		if (displayed && strstr(line->line, ">:"))
			break;

		if (!line_queue_count)
			line_queue = line;
		line_queue_count ++;

		if (line->count >= count_filter) {
			show_lines(line_queue, line_queue_count);
			line_queue_count = 0;
			line_queue = NULL;
		} else if (line_queue_count > TRACE_COUNT) {
			line_queue = line_queue->next;
			line_queue_count --;
		}

		line->count = 0;
		displayed++;
		if (displayed > 300)
			break;
		line = line->next;
	}
}
/*
 * Binary search in the histogram table and record the hit:
 */
static void record_ip(uint64_t ip, int counter)
{
	int left_idx, middle_idx, right_idx, idx;
	unsigned long left, middle, right;

	record_precise_ip(ip);

	left_idx = 0;
	right_idx = sym_table_count-1;
	assert(ip <= max_ip && ip >= min_ip);

	while (left_idx + 1 < right_idx) {
		middle_idx = (left_idx + right_idx) / 2;

		left   = sym_table[  left_idx].addr;
		middle = sym_table[middle_idx].addr;
		right  = sym_table[ right_idx].addr;

		if (!(left <= middle && middle <= right)) {
			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
		}
		assert(left <= middle && middle <= right);
		if (!(left <= ip && ip <= right)) {
			printf(" left: %016lx\n", left);
			printf("   ip: %016lx\n", (unsigned long)ip);
			printf("right: %016lx\n", right);
		}
		assert(left <= ip && ip <= right);
		/*
		 * [ left .... target .... middle .... right ]
		 *   => right := middle
		 */
		if (ip < middle) {
			right_idx = middle_idx;
			continue;
		}
		/*
		 * [ left .... middle ... target ... right ]
		 *   => left := middle
		 */
		left_idx = middle_idx;
	}

	idx = left_idx;

	if (!sym_table[idx].skip)
		sym_table[idx].count[counter]++;
	else events--;
}

static void process_event(uint64_t ip, int counter)
{
	events++;

	if (ip < min_ip || ip > max_ip) {
		userspace_events++;
		return;
	}

	record_ip(ip, counter);
}
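/*
 * Samples outside [min_ip, max_ip] are counted as user-space hits; they
 * only show up in the "kernel:%" ratio of the header line, not in the
 * symbol table.
 */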
static void process_options(int argc, char *argv[])
{
	int error = 0, counter;

	if (strstr(argv[0], "perfstat"))
		run_perfstat = 1;

	for (;;) {
		int option_index = 0;
		/** Options for getopt */
		static struct option long_options[] = {
			{"count",	required_argument,	NULL, 'c'},
			{"cpu",		required_argument,	NULL, 'C'},
			{"delay",	required_argument,	NULL, 'd'},
			{"dump_symtab",	no_argument,		NULL, 'D'},
			{"event",	required_argument,	NULL, 'e'},
			{"filter",	required_argument,	NULL, 'f'},
			{"group",	required_argument,	NULL, 'g'},
			{"help",	no_argument,		NULL, 'h'},
			{"nmi",		required_argument,	NULL, 'n'},
			{"pid",		required_argument,	NULL, 'p'},
			{"vmlinux",	required_argument,	NULL, 'x'},
			{"symbol",	required_argument,	NULL, 's'},
			{"stat",	no_argument,		NULL, 'S'},
			{"zero",	no_argument,		NULL, 'z'},
			{"mmap_pages",	required_argument,	NULL, 'm'},
			{"mmap_info",	no_argument,		NULL, 'M'},
			{"munmap_info",	no_argument,		NULL, 'U'},
			{NULL,		0,			NULL,  0 }
		};
		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:zMU",
				    long_options, &option_index);
		if (c == -1)
			break;

		switch (c) {
		case 'a': system_wide = 1; break;
		case 'c': default_interval = atoi(optarg); break;
		case 'C':
			/* CPU and PID are mutually exclusive */
			if (tid != -1) {
				printf("WARNING: CPU switch overriding PID\n");
				sleep(1);
				tid = -1;
			}
			profile_cpu = atoi(optarg); break;
		case 'd': delay_secs = atoi(optarg); break;
		case 'D': dump_symtab = 1; break;

		case 'e': error = parse_events(optarg); break;

		case 'f': count_filter = atoi(optarg); break;
		case 'g': group = atoi(optarg); break;
		case 'h': display_help(); break;
		case 'n': nmi = atoi(optarg); break;
		case 'p':
			/* CPU and PID are mutually exclusive */
			if (profile_cpu != -1) {
				printf("WARNING: PID switch overriding CPU\n");
				sleep(1);
				profile_cpu = -1;
			}
			tid = atoi(optarg); break;
		case 's': sym_filter = strdup(optarg); break;
		case 'S': run_perfstat = 1; break;
		case 'x': vmlinux = strdup(optarg); break;
		case 'z': zero = 1; break;
		case 'm': mmap_pages = atoi(optarg); break;
		case 'M': use_mmap = 1; break;
		case 'U': use_munmap = 1; break;
		default: error = 1; break;
		}
	}
	if (error)
		display_help();

	if (!nr_counters) {
		if (run_perfstat)
			nr_counters = 8;
		else {
			nr_counters = 1;
			event_id[0] = 0;
		}
	}

	for (counter = 0; counter < nr_counters; counter++) {
		if (event_count[counter])
			continue;

		event_count[counter] = default_interval;
	}
}

struct mmap_data {
	int counter;
	void *base;
	unsigned int mask;
	unsigned int prev;
};
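/*
 * The kernel publishes its current write position in the ->data_head
 * field of the shared mmap control page; the rmb() below keeps us from
 * reading ring-buffer contents before the head value that covers them.
 */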
static unsigned int mmap_read_head(struct mmap_data *md)
{
	struct perf_counter_mmap_page *pc = md->base;
	int head;

	head = pc->data_head;
	rmb();

	return head;
}
struct timeval last_read, this_read;

static void mmap_read(struct mmap_data *md)
{
	unsigned int head = mmap_read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + page_size;
	int diff;

	gettimeofday(&this_read, NULL);

	/*
	 * If we're further behind than half the buffer, there's a chance
	 * the writer will bite our tail and screw up the events under us.
	 *
	 * If we somehow ended up ahead of the head, we got messed up.
	 *
	 * In either case, truncate and restart at head.
	 */
	diff = head - old;
	if (diff > md->mask / 2 || diff < 0) {
		struct timeval iv;
		unsigned long msecs;

		timersub(&this_read, &last_read, &iv);
		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;

		fprintf(stderr, "WARNING: failed to keep up with mmap data."
				"  Last read %lu msecs ago.\n", msecs);

		/*
		 * head points to a known good entry, start there.
		 */
		old = head;
	}

	last_read = this_read;

	for (; old != head;) {
		struct ip_event {
			struct perf_event_header header;
			__u64 ip;
			__u32 pid, tid;
		};
		struct mmap_event {
			struct perf_event_header header;
			__u32 pid, tid;
			__u64 start;
			__u64 len;
			__u64 pgoff;
			char filename[PATH_MAX];
		};

		typedef union event_union {
			struct perf_event_header header;
			struct ip_event ip;
			struct mmap_event mmap;
		} event_t;

		event_t *event = (event_t *)&data[old & md->mask];

		event_t event_copy;

		unsigned int size = event->header.size;

		/*
		 * Event straddles the mmap boundary -- header should always
		 * be inside due to u64 alignment of output.
		 */
		if ((old & md->mask) + size != ((old + size) & md->mask)) {
			unsigned int offset = old;
			unsigned int len = min(sizeof(*event), size), cpy;
			void *dst = &event_copy;

			do {
				cpy = min(md->mask + 1 - (offset & md->mask), len);
				memcpy(dst, &data[offset & md->mask], cpy);
				offset += cpy;
				dst += cpy;
				len -= cpy;
			} while (len);

			event = &event_copy;
		}

		old += size;

		switch (event->header.type) {
		case PERF_EVENT_IP:
		case PERF_EVENT_IP | __PERF_EVENT_TID:
			process_event(event->ip.ip, md->counter);
			break;

		case PERF_EVENT_MMAP:
		case PERF_EVENT_MUNMAP:
			printf("%s: %Lu %Lu %Lu %s\n",
					event->header.type == PERF_EVENT_MMAP
					  ? "mmap" : "munmap",
					event->mmap.start,
					event->mmap.len,
					event->mmap.pgoff,
					event->mmap.filename);
			break;
		}
	}

	md->prev = old;
}
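/*
 * Note that mmap_read() only advances its private ->prev offset; nothing
 * is written back to the kernel, so the half-buffer check above is the
 * only protection against the writer overrunning the reader.
 */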
int main(int argc, char *argv[])
{
	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
	struct perf_counter_hw_event hw_event;
	int i, counter, group_fd, nr_poll = 0;
	unsigned int cpu;
	int ret;

	page_size = sysconf(_SC_PAGE_SIZE);

	process_options(argc, argv);

	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	assert(nr_cpus <= MAX_NR_CPUS);
	assert(nr_cpus >= 0);

	if (run_perfstat)
		return do_perfstat(argc, argv);

	if (tid != -1 || profile_cpu != -1)
		nr_cpus = 1;

	parse_symbols();
	if (vmlinux && sym_filter_entry)
		parse_vmlinux(vmlinux);

	for (i = 0; i < nr_cpus; i++) {
		group_fd = -1;
		for (counter = 0; counter < nr_counters; counter++) {

			cpu = profile_cpu;
			if (tid == -1 && profile_cpu == -1)
				cpu = i;

			memset(&hw_event, 0, sizeof(hw_event));
			hw_event.config		= event_id[counter];
			hw_event.irq_period	= event_count[counter];
			hw_event.record_type	= PERF_RECORD_IRQ;
			hw_event.nmi		= nmi;
			hw_event.include_tid	= 1;
			hw_event.mmap		= use_mmap;
			hw_event.munmap		= use_munmap;

			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
			if (fd[i][counter] < 0) {
				int err = errno;
				printf("kerneltop error: syscall returned with %d (%s)\n",
					fd[i][counter], strerror(err));
				if (err == EPERM)
					printf("Are you root?\n");
				exit(-1);
			}
			assert(fd[i][counter] >= 0);
			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);

			/*
			 * First counter acts as the group leader:
			 */
			if (group && group_fd == -1)
				group_fd = fd[i][counter];

			event_array[nr_poll].fd = fd[i][counter];
			event_array[nr_poll].events = POLLIN;
			nr_poll++;

			mmap_array[i][counter].counter = counter;
			mmap_array[i][counter].prev = 0;
			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
					PROT_READ, MAP_SHARED, fd[i][counter], 0);
			if (mmap_array[i][counter].base == MAP_FAILED) {
				printf("kerneltop error: failed to mmap with %d (%s)\n",
						errno, strerror(errno));
				exit(-1);
			}
		}
	}

	printf("KernelTop refresh period: %d seconds\n", delay_secs);
	last_refresh = time(NULL);

	while (1) {
		int hits = events;

		for (i = 0; i < nr_cpus; i++) {
			for (counter = 0; counter < nr_counters; counter++)
				mmap_read(&mmap_array[i][counter]);
		}

		if (time(NULL) >= last_refresh + delay_secs) {
			print_sym_table();
			events = userspace_events = 0;
		}

		if (hits == events)
			ret = poll(event_array, nr_poll, 1000);
		hits = events;
	}

	return 0;
}