2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
30 Started by Ingo Molnar <mingo@redhat.com>
32 Improvements and fixes by:
34 Arjan van de Ven <arjan@linux.intel.com>
35 Yanmin Zhang <yanmin.zhang@intel.com>
36 Mike Galbraith <efault@gmx.de>
38 Released under the GPL v2. (and only v2, not any later version)
42 #include <sys/types.h>
59 #include <sys/syscall.h>
60 #include <sys/ioctl.h>
62 #include <sys/prctl.h>
66 #include <linux/unistd.h>
69 # define __NR_perf_counter_open 295
73 # define __NR_perf_counter_open 333
77 * Pick up some kernel type conventions:
82 typedef unsigned int __u32
;
83 typedef unsigned long long __u64
;
84 typedef long long __s64
;
87 * User-space ABI bits:
91 * Generalized performance counter event types, used by the hw_event.type
92 * parameter of the sys_perf_counter_open() syscall:
96 * Common hardware events, generalized by the kernel:
98 PERF_COUNT_CPU_CYCLES
= 0,
99 PERF_COUNT_INSTRUCTIONS
= 1,
100 PERF_COUNT_CACHE_REFERENCES
= 2,
101 PERF_COUNT_CACHE_MISSES
= 3,
102 PERF_COUNT_BRANCH_INSTRUCTIONS
= 4,
103 PERF_COUNT_BRANCH_MISSES
= 5,
104 PERF_COUNT_BUS_CYCLES
= 6,
106 PERF_HW_EVENTS_MAX
= 7,
109 * Special "software" counters provided by the kernel, even if
110 * the hardware does not support performance counters. These
111 * counters measure various physical and sw events of the
112 * kernel (and allow the profiling of them as well):
114 PERF_COUNT_CPU_CLOCK
= -1,
115 PERF_COUNT_TASK_CLOCK
= -2,
116 PERF_COUNT_PAGE_FAULTS
= -3,
117 PERF_COUNT_CONTEXT_SWITCHES
= -4,
118 PERF_COUNT_CPU_MIGRATIONS
= -5,
120 PERF_SW_EVENTS_MIN
= -6,
124 * IRQ-notification data record type:
126 enum perf_counter_record_type
{
127 PERF_RECORD_SIMPLE
= 0,
129 PERF_RECORD_GROUP
= 2,
133 * Hardware event to monitor via a performance monitoring counter:
135 struct perf_counter_hw_event
{
142 __u64 disabled
: 1, /* off by default */
143 nmi
: 1, /* NMI sampling */
144 raw
: 1, /* raw event type */
145 inherit
: 1, /* children inherit it */
146 pinned
: 1, /* must always be on PMU */
147 exclusive
: 1, /* only group on PMU */
148 exclude_user
: 1, /* don't count user */
149 exclude_kernel
: 1, /* ditto kernel */
150 exclude_hv
: 1, /* ditto hypervisor */
151 exclude_idle
: 1, /* don't count when idle */
155 __u32 extra_config_len
;
163 * Ioctls that can be done on a perf counter fd:
165 #define PERF_COUNTER_IOC_ENABLE _IO('$', 0)
166 #define PERF_COUNTER_IOC_DISABLE _IO('$', 1)
168 asmlinkage
int sys_perf_counter_open(
170 struct perf_counter_hw_event
*hw_event_uptr __user
,
179 __NR_perf_counter_open
, hw_event_uptr
, pid
, cpu
, group_fd
, flags
);
180 #if defined(__x86_64__) || defined(__i386__)
181 if (ret
< 0 && ret
> -4096) {
189 const char *event_types
[] = {
199 const unsigned int default_count
[] = {
209 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
210 * counters in the current task.
212 #define PR_TASK_PERF_COUNTERS_DISABLE 31
213 #define PR_TASK_PERF_COUNTERS_ENABLE 32
215 #define MAX_COUNTERS 8
217 static int nr_counters
= -1;
219 static __u64 count_filter
= 100;
221 #define MAX_NR_CPUS 256
223 static int event_count
[MAX_COUNTERS
];
224 static unsigned long event_id
[MAX_COUNTERS
];
225 static int event_raw
[MAX_COUNTERS
];
228 static int profile_cpu
= -1;
229 static int nr_cpus
= 0;
231 static int group
= 0;
233 static char *vmlinux
;
235 static char *sym_filter
;
236 static unsigned long filter_start
;
237 static unsigned long filter_end
;
239 static int delay_secs
= 2;
241 static int dump_symtab
;
251 static void display_help(void)
254 "Usage: kerneltop [<options>]\n\n"
255 "KernelTop Options (up to %d event types can be specified at once):\n\n",
258 " -e EID --event_id=EID # event type ID [default: 0]\n"
261 " 2: cache accesses\n"
263 " 4: branch instructions\n"
264 " 5: branch prediction misses\n"
266 " rNNN: raw PMU events (eventsel+umask)\n\n"
267 " -c CNT --count=CNT # event period to sample\n\n"
268 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
269 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
270 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
271 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
272 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
273 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use:\n"
274 " -z --zero # zero counts after display\n"
275 " -D --dump_symtab # dump symbol table to stderr on startup\n"
281 static void process_options(int argc
, char *argv
[])
283 int error
= 0, counter
;
286 int option_index
= 0;
287 /** Options for getopt */
288 static struct option long_options
[] = {
289 {"count", required_argument
, NULL
, 'c'},
290 {"cpu", required_argument
, NULL
, 'C'},
291 {"delay", required_argument
, NULL
, 'd'},
292 {"dump_symtab", no_argument
, NULL
, 'D'},
293 {"event_id", required_argument
, NULL
, 'e'},
294 {"filter", required_argument
, NULL
, 'f'},
295 {"group", required_argument
, NULL
, 'g'},
296 {"help", no_argument
, NULL
, 'h'},
297 {"nmi", required_argument
, NULL
, 'n'},
298 {"pid", required_argument
, NULL
, 'p'},
299 {"vmlinux", required_argument
, NULL
, 'x'},
300 {"symbol", required_argument
, NULL
, 's'},
301 {"zero", no_argument
, NULL
, 'z'},
304 int c
= getopt_long(argc
, argv
, "c:C:d:De:f:g:hn:p:s:x:z",
305 long_options
, &option_index
);
311 if (nr_counters
== -1)
313 event_count
[nr_counters
] = atoi(optarg
); break;
315 /* CPU and PID are mutually exclusive */
317 printf("WARNING: CPU switch overriding PID\n");
321 profile_cpu
= atoi(optarg
); break;
322 case 'd': delay_secs
= atoi(optarg
); break;
323 case 'D': dump_symtab
= 1; break;
327 if (nr_counters
== MAX_COUNTERS
) {
331 if (*optarg
== 'r') {
332 event_raw
[nr_counters
] = 1;
335 event_id
[nr_counters
] = strtol(optarg
, NULL
, 16);
338 case 'f': count_filter
= atoi(optarg
); break;
339 case 'g': group
= atoi(optarg
); break;
340 case 'h': display_help(); break;
341 case 'n': nmi
= atoi(optarg
); break;
343 /* CPU and PID are mutually exclusive */
344 if (profile_cpu
!= -1) {
345 printf("WARNING: PID switch overriding CPU\n");
349 tid
= atoi(optarg
); break;
350 case 's': sym_filter
= strdup(optarg
); break;
351 case 'x': vmlinux
= strdup(optarg
); break;
352 case 'z': zero
= 1; break;
353 default: error
= 1; break;
363 for (counter
= 0; counter
< nr_counters
; counter
++) {
364 if (event_count
[counter
])
367 if (event_id
[counter
] < PERF_HW_EVENTS_MAX
)
368 event_count
[counter
] = default_count
[event_id
[counter
]];
370 event_count
[counter
] = 100000;
374 static uint64_t min_ip
;
375 static uint64_t max_ip
= -1ll;
378 unsigned long long addr
;
380 unsigned long count
[MAX_COUNTERS
];
385 #define MAX_SYMS 100000
387 static int sym_table_count
;
389 struct sym_entry
*sym_filter_entry
;
391 static struct sym_entry sym_table
[MAX_SYMS
];
393 static void show_details(struct sym_entry
*sym
);
396 * Ordering weight: count-1 * count-1 * ... / count-n
398 static double sym_weight(const struct sym_entry
*sym
)
403 weight
= sym
->count
[0];
405 for (counter
= 1; counter
< nr_counters
-1; counter
++)
406 weight
*= sym
->count
[counter
];
408 weight
/= (sym
->count
[counter
] + 1);
413 static int compare(const void *__sym1
, const void *__sym2
)
415 const struct sym_entry
*sym1
= __sym1
, *sym2
= __sym2
;
417 return sym_weight(sym1
) < sym_weight(sym2
);
420 static time_t last_refresh
;
422 static long userspace_events
;
423 static const char CONSOLE_CLEAR
[] = "\e[H\e[2J";
425 static struct sym_entry tmp
[MAX_SYMS
];
427 static void print_sym_table(void)
431 float events_per_sec
= events
/delay_secs
;
432 float kevents_per_sec
= (events
-userspace_events
)/delay_secs
;
434 memcpy(tmp
, sym_table
, sizeof(sym_table
[0])*sym_table_count
);
435 qsort(tmp
, sym_table_count
, sizeof(tmp
[0]), compare
);
437 write(1, CONSOLE_CLEAR
, strlen(CONSOLE_CLEAR
));
440 "------------------------------------------------------------------------------\n");
441 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
443 100.0 - (100.0*((events_per_sec
-kevents_per_sec
)/events_per_sec
)),
444 nmi
? "NMI" : "IRQ");
446 if (nr_counters
== 1)
447 printf("%d ", event_count
[0]);
449 for (counter
= 0; counter
< nr_counters
; counter
++) {
453 if (event_id
[counter
] < PERF_HW_EVENTS_MAX
)
454 printf( "%s", event_types
[event_id
[counter
]]);
456 printf( "raw:%04lx", event_id
[counter
]);
462 printf(" (tid: %d", tid
);
466 if (profile_cpu
!= -1)
467 printf(", cpu: %d)\n", profile_cpu
);
472 printf(", %d CPUs)\n", nr_cpus
);
475 printf("------------------------------------------------------------------------------\n\n");
477 if (nr_counters
== 1)
480 printf(" weight events");
482 printf(" RIP kernel function\n"
483 " ______ ______ ________________ _______________\n\n"
487 for (i
= 0; i
< sym_table_count
; i
++) {
490 if (nr_counters
== 1) {
492 tmp
[i
].count
[0] >= count_filter
) {
493 printf("%19.2f - %016llx : %s\n",
494 sym_weight(tmp
+ i
), tmp
[i
].addr
, tmp
[i
].sym
);
499 tmp
[i
].count
[0] >= count_filter
) {
500 printf("%8.1f %10ld - %016llx : %s\n",
503 tmp
[i
].addr
, tmp
[i
].sym
);
508 * Add decay to the counts:
510 for (count
= 0; count
< nr_counters
; count
++)
511 sym_table
[i
].count
[count
] = zero
? 0 : sym_table
[i
].count
[count
] * 7 / 8;
514 if (sym_filter_entry
)
515 show_details(sym_filter_entry
);
517 last_refresh
= time(NULL
);
520 struct pollfd stdin_poll
= { .fd
= 0, .events
= POLLIN
};
522 if (poll(&stdin_poll
, 1, 0) == 1) {
523 printf("key pressed - exiting.\n");
529 static int read_symbol(FILE *in
, struct sym_entry
*s
)
531 static int filter_match
= 0;
536 rc
= fscanf(in
, "%llx %c %499s", &s
->addr
, &stype
, str
);
542 /* skip until end of line: */
546 if (rc
== '\n' || rc
== EOF
|| pos
>= 499)
555 /* Filter out known duplicates and non-text symbols. */
556 if (!strcmp(sym
, "_text"))
558 if (!min_ip
&& !strcmp(sym
, "_stext"))
560 if (!strcmp(sym
, "_etext") || !strcmp(sym
, "_sinittext"))
562 if (stype
!= 'T' && stype
!= 't')
564 if (!strncmp("init_module", sym
, 11) || !strncmp("cleanup_module", sym
, 14))
566 if (strstr(sym
, "_text_start") || strstr(sym
, "_text_end"))
569 s
->sym
= malloc(strlen(str
));
572 strcpy((char *)s
->sym
, str
);
575 /* Tag events to be skipped. */
576 if (!strcmp("default_idle", s
->sym
) || !strcmp("cpu_idle", s
->sym
))
578 if (!strcmp("enter_idle", s
->sym
) || !strcmp("exit_idle", s
->sym
))
581 if (filter_match
== 1) {
582 filter_end
= s
->addr
;
584 if (filter_end
- filter_start
> 10000) {
585 printf("hm, too large filter symbol <%s> - skipping.\n",
587 printf("symbol filter start: %016lx\n", filter_start
);
588 printf(" end: %016lx\n", filter_end
);
589 filter_end
= filter_start
= 0;
594 if (filter_match
== 0 && sym_filter
&& !strcmp(s
->sym
, sym_filter
)) {
596 filter_start
= s
->addr
;
602 int compare_addr(const void *__sym1
, const void *__sym2
)
604 const struct sym_entry
*sym1
= __sym1
, *sym2
= __sym2
;
606 return sym1
->addr
> sym2
->addr
;
609 static void sort_symbol_table(void)
614 qsort(sym_table
, sym_table_count
, sizeof(sym_table
[0]), compare_addr
);
615 for (i
= 0, dups
= 0; i
< sym_table_count
; i
++) {
616 if (sym_table
[i
].addr
== sym_table
[i
+1].addr
) {
617 sym_table
[i
+1].addr
= -1ll;
621 sym_table_count
-= dups
;
625 static void parse_symbols(void)
627 struct sym_entry
*last
;
629 FILE *kallsyms
= fopen("/proc/kallsyms", "r");
632 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
636 while (!feof(kallsyms
)) {
637 if (read_symbol(kallsyms
, &sym_table
[sym_table_count
]) == 0) {
639 assert(sym_table_count
<= MAX_SYMS
);
644 min_ip
= sym_table
[0].addr
;
645 max_ip
= sym_table
[sym_table_count
-1].addr
;
646 last
= sym_table
+ sym_table_count
++;
653 for (count
=0; count
< sym_table_count
; count
++) {
654 if (!strcmp(sym_table
[count
].sym
, sym_filter
)) {
655 sym_filter_entry
= &sym_table
[count
];
663 for (i
= 0; i
< sym_table_count
; i
++)
664 fprintf(stderr
, "%llx %s\n",
665 sym_table
[i
].addr
, sym_table
[i
].sym
);
670 static void parse_vmlinux(char *filename
)
673 char command
[PATH_MAX
*2];
677 sprintf(command
, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start
, filter_end
, filename
);
679 file
= popen(command
, "r");
683 while (!feof(file
)) {
684 struct source_line
*src
;
688 src
= malloc(sizeof(struct source_line
));
690 memset(src
, 0, sizeof(struct source_line
));
692 if (getline(&src
->line
, &dummy
, file
) < 0)
697 c
= strchr(src
->line
, '\n');
701 lines
= g_list_prepend(lines
, src
);
703 if (strlen(src
->line
)>8 && src
->line
[8] == ':')
704 src
->EIP
= strtoull(src
->line
, NULL
, 16);
705 if (strlen(src
->line
)>8 && src
->line
[16] == ':')
706 src
->EIP
= strtoull(src
->line
, NULL
, 16);
709 lines
= g_list_reverse(lines
);
712 static void record_precise_ip(uint64_t ip
)
714 struct source_line
*line
;
717 item
= g_list_first(lines
);
724 item
= g_list_next(item
);
728 static void lookup_sym_in_vmlinux(struct sym_entry
*sym
)
730 struct source_line
*line
;
732 char pattern
[PATH_MAX
];
733 sprintf(pattern
, "<%s>:", sym
->sym
);
735 item
= g_list_first(lines
);
738 if (strstr(line
->line
, pattern
)) {
742 item
= g_list_next(item
);
746 void show_lines(GList
*item_queue
, int item_queue_count
)
749 struct source_line
*line
;
751 for (i
= 0; i
< item_queue_count
; i
++) {
752 line
= item_queue
->data
;
753 printf("%8li\t%s\n", line
->count
, line
->line
);
754 item_queue
= g_list_next(item_queue
);
758 #define TRACE_COUNT 3
760 static void show_details(struct sym_entry
*sym
)
762 struct source_line
*line
;
765 GList
*item_queue
= NULL
;
766 int item_queue_count
= 0;
769 lookup_sym_in_vmlinux(sym
);
773 printf("Showing details for %s\n", sym
->sym
);
778 if (displayed
&& strstr(line
->line
, ">:"))
781 if (!item_queue_count
)
785 if (line
->count
>= count_filter
) {
786 show_lines(item_queue
, item_queue_count
);
787 item_queue_count
= 0;
789 } else if (item_queue_count
> TRACE_COUNT
) {
790 item_queue
= g_list_next(item_queue
);
798 item
= g_list_next(item
);
803 * Binary search in the histogram table and record the hit:
805 static void record_ip(uint64_t ip
, int counter
)
807 int left_idx
, middle_idx
, right_idx
, idx
;
808 unsigned long left
, middle
, right
;
810 record_precise_ip(ip
);
813 right_idx
= sym_table_count
-1;
814 assert(ip
<= max_ip
&& ip
>= min_ip
);
816 while (left_idx
+ 1 < right_idx
) {
817 middle_idx
= (left_idx
+ right_idx
) / 2;
819 left
= sym_table
[ left_idx
].addr
;
820 middle
= sym_table
[middle_idx
].addr
;
821 right
= sym_table
[ right_idx
].addr
;
823 if (!(left
<= middle
&& middle
<= right
)) {
824 printf("%016lx...\n%016lx...\n%016lx\n", left
, middle
, right
);
825 printf("%d %d %d\n", left_idx
, middle_idx
, right_idx
);
827 assert(left
<= middle
&& middle
<= right
);
828 if (!(left
<= ip
&& ip
<= right
)) {
829 printf(" left: %016lx\n", left
);
830 printf(" ip: %016lx\n", ip
);
831 printf("right: %016lx\n", right
);
833 assert(left
<= ip
&& ip
<= right
);
835 * [ left .... target .... middle .... right ]
839 right_idx
= middle_idx
;
843 * [ left .... middle ... target ... right ]
846 left_idx
= middle_idx
;
851 if (!sym_table
[idx
].skip
)
852 sym_table
[idx
].count
[counter
]++;
856 static void process_event(uint64_t ip
, int counter
)
860 if (ip
< min_ip
|| ip
> max_ip
) {
865 record_ip(ip
, counter
);
868 int main(int argc
, char *argv
[])
870 struct pollfd event_array
[MAX_NR_CPUS
][MAX_COUNTERS
];
871 struct perf_counter_hw_event hw_event
;
872 int fd
[MAX_NR_CPUS
][MAX_COUNTERS
];
873 int i
, counter
, group_fd
;
879 process_options(argc
, argv
);
881 nr_cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
882 if (tid
!= -1 || profile_cpu
!= -1)
885 assert(nr_cpus
<= MAX_NR_CPUS
);
887 for (i
= 0; i
< nr_cpus
; i
++) {
889 for (counter
= 0; counter
< nr_counters
; counter
++) {
892 if (tid
== -1 && profile_cpu
== -1)
895 memset(&hw_event
, 0, sizeof(hw_event
));
896 hw_event
.type
= event_id
[counter
];
897 hw_event
.raw
= event_raw
[counter
];
898 hw_event
.irq_period
= event_count
[counter
];
899 hw_event
.record_type
= PERF_RECORD_IRQ
;
902 fd
[i
][counter
] = sys_perf_counter_open(&hw_event
, tid
, cpu
, group_fd
, 0);
903 fcntl(fd
[i
][counter
], F_SETFL
, O_NONBLOCK
);
904 if (fd
[i
][counter
] < 0) {
905 printf("kerneltop error: syscall returned with %d (%s)\n",
906 fd
[i
][counter
], strerror(-fd
[i
][counter
]));
907 if (fd
[i
][counter
] == -1)
908 printf("Are you root?\n");
911 assert(fd
[i
][counter
] >= 0);
914 * First counter acts as the group leader:
916 if (group
&& group_fd
== -1)
917 group_fd
= fd
[i
][counter
];
919 event_array
[i
][counter
].fd
= fd
[i
][counter
];
920 event_array
[i
][counter
].events
= POLLIN
;
925 if (vmlinux
&& sym_filter_entry
)
926 parse_vmlinux(vmlinux
);
928 printf("KernelTop refresh period: %d seconds\n", delay_secs
);
929 last_refresh
= time(NULL
);
934 for (i
= 0; i
< nr_cpus
; i
++) {
935 for (counter
= 0; counter
< nr_counters
; counter
++) {
936 res
= read(fd
[i
][counter
], (char *) &ip
, sizeof(ip
));
938 assert(res
== sizeof(ip
));
940 process_event(ip
, counter
);
945 if (time(NULL
) >= last_refresh
+ delay_secs
) {
947 events
= userspace_events
= 0;
951 ret
= poll(event_array
[0], nr_cpus
, 1000);