2 * kerneltop.c: show top kernel functions - performance counters showcase
6 cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
10 ------------------------------------------------------------------------------
11 KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2)
12 ------------------------------------------------------------------------------
14 weight RIP kernel function
15 ______ ________________ _______________
17 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
18 33.00 - ffffffff804cb740 : sock_alloc_send_skb
19 31.26 - ffffffff804ce808 : skb_push
20 22.43 - ffffffff80510004 : tcp_established_options
21 19.00 - ffffffff8027d250 : find_get_page
22 15.76 - ffffffff804e4fc9 : eth_type_trans
23 15.20 - ffffffff804d8baa : dst_release
24 14.86 - ffffffff804cf5d8 : skb_release_head_state
25 14.00 - ffffffff802217d5 : read_hpet
26 12.00 - ffffffff804ffb7f : __ip_local_out
27 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
28 8.54 - ffffffff805001a3 : ip_queue_xmit
32 * perfstat: /usr/bin/time -alike performance counter statistics utility
34 It summarizes the counter events of all tasks (and child tasks),
35 covering all CPUs that the command (or workload) executes on.
36 It only counts the per-task events of the workload started,
37 independent of how many other tasks run on those CPUs.
41 $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
43 Performance counter stats for 'ls':
45 163516953 instructions
51 * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
53 * Improvements and fixes by:
55 * Arjan van de Ven <arjan@linux.intel.com>
56 * Yanmin Zhang <yanmin.zhang@intel.com>
57 * Wu Fengguang <fengguang.wu@intel.com>
58 * Mike Galbraith <efault@gmx.de>
59 * Paul Mackerras <paulus@samba.org>
61 * Released under the GPL v2. (and only v2, not any later version)
65 #include <sys/types.h>
81 #include <sys/syscall.h>
82 #include <sys/ioctl.h>
84 #include <sys/prctl.h>
89 #include <linux/unistd.h>
90 #include <linux/types.h>
92 #include "../../include/linux/perf_counter.h"
96 * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
97 * counters in the current task.
99 #define PR_TASK_PERF_COUNTERS_DISABLE 31
100 #define PR_TASK_PERF_COUNTERS_ENABLE 32
102 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
106 struct timespec ts; \
108 clock_gettime(CLOCK_MONOTONIC, &ts); \
109 ts.tv_sec * 1000000000ULL + ts.tv_nsec; \
113 * Pick up some kernel type conventions:
119 #define __NR_perf_counter_open 295
120 #define rmb() asm volatile("lfence" ::: "memory")
121 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
125 #define __NR_perf_counter_open 333
126 #define rmb() asm volatile("lfence" ::: "memory")
127 #define cpu_relax() asm volatile("rep; nop" ::: "memory");
131 #define __NR_perf_counter_open 319
132 #define rmb() asm volatile ("sync" ::: "memory")
133 #define cpu_relax() asm volatile ("" ::: "memory");
136 #define unlikely(x) __builtin_expect(!!(x), 0)
/*
 * Type-safe min(): each argument is evaluated exactly once (no
 * double-evaluation hazard).  The dead address comparison of the two
 * temporaries makes the compiler warn when x and y have different
 * types.  __typeof__ (rather than plain `typeof`) is used so the
 * macro also compiles under strict ISO mode (-std=c11), where the
 * unprefixed GNU keyword is unavailable before C23.
 */
#define min(x, y) ({				\
	__typeof__(x) _min1 = (x);		\
	__typeof__(y) _min2 = (y);		\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })
143 asmlinkage
int sys_perf_counter_open(
144 struct perf_counter_hw_event
*hw_event_uptr __user
,
151 __NR_perf_counter_open
, hw_event_uptr
, pid
, cpu
, group_fd
, flags
);
154 #define MAX_COUNTERS 64
155 #define MAX_NR_CPUS 256
157 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
159 static int run_perfstat
= 0;
160 static int system_wide
= 0;
162 static int nr_counters
= 0;
163 static __u64 event_id
[MAX_COUNTERS
] = {
164 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_TASK_CLOCK
),
165 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CONTEXT_SWITCHES
),
166 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
),
167 EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
),
169 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
),
170 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_INSTRUCTIONS
),
171 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_REFERENCES
),
172 EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_MISSES
),
174 static int default_interval
= 100000;
175 static int event_count
[MAX_COUNTERS
];
176 static int fd
[MAX_NR_CPUS
][MAX_COUNTERS
];
178 static __u64 count_filter
= 100;
181 static int profile_cpu
= -1;
182 static int nr_cpus
= 0;
184 static int group
= 0;
185 static unsigned int page_size
;
186 static unsigned int mmap_pages
= 16;
187 static int use_mmap
= 0;
188 static int use_munmap
= 0;
190 static char *vmlinux
;
192 static char *sym_filter
;
193 static unsigned long filter_start
;
194 static unsigned long filter_end
;
196 static int delay_secs
= 2;
198 static int dump_symtab
;
204 struct source_line
*next
;
207 static struct source_line
*lines
;
208 static struct source_line
**lines_tail
;
210 const unsigned int default_count
[] = {
219 static char *hw_event_names
[] = {
229 static char *sw_event_names
[] = {
239 struct event_symbol
{
244 static struct event_symbol event_symbols
[] = {
245 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
), "cpu-cycles", },
246 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CPU_CYCLES
), "cycles", },
247 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_INSTRUCTIONS
), "instructions", },
248 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_REFERENCES
), "cache-references", },
249 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_CACHE_MISSES
), "cache-misses", },
250 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_INSTRUCTIONS
), "branch-instructions", },
251 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_INSTRUCTIONS
), "branches", },
252 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BRANCH_MISSES
), "branch-misses", },
253 {EID(PERF_TYPE_HARDWARE
, PERF_COUNT_BUS_CYCLES
), "bus-cycles", },
255 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_CLOCK
), "cpu-clock", },
256 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_TASK_CLOCK
), "task-clock", },
257 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
), "page-faults", },
258 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS
), "faults", },
259 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS_MIN
), "minor-faults", },
260 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_PAGE_FAULTS_MAJ
), "major-faults", },
261 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CONTEXT_SWITCHES
), "context-switches", },
262 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CONTEXT_SWITCHES
), "cs", },
263 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
), "cpu-migrations", },
264 {EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_MIGRATIONS
), "migrations", },
267 #define __PERF_COUNTER_FIELD(config, name) \
268 ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
270 #define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW)
271 #define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG)
272 #define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE)
273 #define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT)
275 static void display_events_help(void)
281 " -e EVENT --event=EVENT # symbolic-name abbreviations");
283 for (i
= 0; i
< ARRAY_SIZE(event_symbols
); i
++) {
286 e
= event_symbols
[i
].event
;
287 type
= PERF_COUNTER_TYPE(e
);
288 id
= PERF_COUNTER_ID(e
);
290 printf("\n %d:%d: %-20s",
291 type
, id
, event_symbols
[i
].symbol
);
295 " rNNN: raw PMU events (eventsel+umask)\n\n");
298 static void display_perfstat_help(void)
301 "Usage: perfstat [<events...>] <cmd...>\n\n"
302 "PerfStat Options (up to %d event types can be specified):\n\n",
305 display_events_help();
308 " -a # system-wide collection\n");
312 static void display_help(void)
315 return display_perfstat_help();
318 "Usage: kerneltop [<options>]\n"
319 " Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
320 "KernelTop Options (up to %d event types can be specified at once):\n\n",
323 display_events_help();
326 " -S --stat # perfstat COMMAND\n"
327 " -a # system-wide collection (for perfstat)\n\n"
328 " -c CNT --count=CNT # event period to sample\n\n"
329 " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n"
330 " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n"
331 " -d delay --delay=<seconds> # sampling/display delay [default: 2]\n"
332 " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n"
333 " -s symbol --symbol=<symbol> # function to be showed annotated one-shot\n"
334 " -x path --vmlinux=<path> # the vmlinux binary, required for -s use\n"
335 " -z --zero # zero counts after display\n"
336 " -D --dump_symtab # dump symbol table to stderr on startup\n"
337 " -m pages --mmap_pages=<pages> # number of mmap data pages\n"
338 " -M --mmap_info # print mmap info stream\n"
339 " -U --munmap_info # print munmap info stream\n"
345 static char *event_name(int ctr
)
347 __u64 config
= event_id
[ctr
];
348 int type
= PERF_COUNTER_TYPE(config
);
349 int id
= PERF_COUNTER_ID(config
);
352 if (PERF_COUNTER_RAW(config
)) {
353 sprintf(buf
, "raw 0x%llx", PERF_COUNTER_CONFIG(config
));
358 case PERF_TYPE_HARDWARE
:
359 if (id
< PERF_HW_EVENTS_MAX
)
360 return hw_event_names
[id
];
361 return "unknown-hardware";
363 case PERF_TYPE_SOFTWARE
:
364 if (id
< PERF_SW_EVENTS_MAX
)
365 return sw_event_names
[id
];
366 return "unknown-software";
376 * Each event can have multiple symbolic names.
377 * Symbolic names are (almost) exactly matched.
379 static __u64
match_event_symbols(char *str
)
385 if (sscanf(str
, "r%llx", &config
) == 1)
386 return config
| PERF_COUNTER_RAW_MASK
;
388 if (sscanf(str
, "%d:%llu", &type
, &id
) == 2)
389 return EID(type
, id
);
391 for (i
= 0; i
< ARRAY_SIZE(event_symbols
); i
++) {
392 if (!strncmp(str
, event_symbols
[i
].symbol
,
393 strlen(event_symbols
[i
].symbol
)))
394 return event_symbols
[i
].event
;
400 static int parse_events(char *str
)
405 if (nr_counters
== MAX_COUNTERS
)
408 config
= match_event_symbols(str
);
412 event_id
[nr_counters
] = config
;
415 str
= strstr(str
, ",");
429 char fault_here
[1000000];
431 static void create_perfstat_counter(int counter
)
433 struct perf_counter_hw_event hw_event
;
435 memset(&hw_event
, 0, sizeof(hw_event
));
436 hw_event
.config
= event_id
[counter
];
437 hw_event
.record_type
= PERF_RECORD_SIMPLE
;
442 for (cpu
= 0; cpu
< nr_cpus
; cpu
++) {
443 fd
[cpu
][counter
] = sys_perf_counter_open(&hw_event
, -1, cpu
, -1, 0);
444 if (fd
[cpu
][counter
] < 0) {
445 printf("perfstat error: syscall returned with %d (%s)\n",
446 fd
[cpu
][counter
], strerror(errno
));
451 hw_event
.inherit
= 1;
452 hw_event
.disabled
= 1;
454 fd
[0][counter
] = sys_perf_counter_open(&hw_event
, 0, -1, -1, 0);
455 if (fd
[0][counter
] < 0) {
456 printf("perfstat error: syscall returned with %d (%s)\n",
457 fd
[0][counter
], strerror(errno
));
463 int do_perfstat(int argc
, char *argv
[])
465 unsigned long long t0
, t1
;
474 for (counter
= 0; counter
< nr_counters
; counter
++)
475 create_perfstat_counter(counter
);
484 * Enable counters and exec the command:
487 prctl(PR_TASK_PERF_COUNTERS_ENABLE
);
489 if ((pid
= fork()) < 0)
490 perror("failed to fork");
492 if (execvp(argv
[0], argv
)) {
497 while (wait(&status
) >= 0)
499 prctl(PR_TASK_PERF_COUNTERS_DISABLE
);
504 fprintf(stderr
, "\n");
505 fprintf(stderr
, " Performance counter stats for \'%s\':\n",
507 fprintf(stderr
, "\n");
509 for (counter
= 0; counter
< nr_counters
; counter
++) {
511 __u64 count
, single_count
;
514 for (cpu
= 0; cpu
< nr_cpus
; cpu
++) {
515 res
= read(fd
[cpu
][counter
],
516 (char *) &single_count
, sizeof(single_count
));
517 assert(res
== sizeof(single_count
));
518 count
+= single_count
;
521 if (event_id
[counter
] == EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_CPU_CLOCK
) ||
522 event_id
[counter
] == EID(PERF_TYPE_SOFTWARE
, PERF_COUNT_TASK_CLOCK
)) {
524 double msecs
= (double)count
/ 1000000;
526 fprintf(stderr
, " %14.6f %-20s (msecs)\n",
527 msecs
, event_name(counter
));
529 fprintf(stderr
, " %14Ld %-20s (events)\n",
530 count
, event_name(counter
));
533 fprintf(stderr
, "\n");
534 fprintf(stderr
, " Wall-clock time elapsed: %12.6f msecs\n",
535 (double)(t1
-t0
)/1e6
);
536 fprintf(stderr
, "\n");
545 static uint64_t min_ip
;
546 static uint64_t max_ip
= -1ll;
549 unsigned long long addr
;
551 unsigned long count
[MAX_COUNTERS
];
553 struct source_line
*source
;
556 #define MAX_SYMS 100000
558 static int sym_table_count
;
560 struct sym_entry
*sym_filter_entry
;
562 static struct sym_entry sym_table
[MAX_SYMS
];
564 static void show_details(struct sym_entry
*sym
);
567 * Ordering weight: count-1 * count-2 * ... / count-n
569 static double sym_weight(const struct sym_entry
*sym
)
574 weight
= sym
->count
[0];
576 for (counter
= 1; counter
< nr_counters
-1; counter
++)
577 weight
*= sym
->count
[counter
];
579 weight
/= (sym
->count
[counter
] + 1);
/*
 * qsort() comparator: order symbols by descending weight (hottest
 * first).
 *
 * NOTE: the previous version returned the bare result of `<`, i.e.
 * only 0 or 1.  qsort() requires a consistent three-way result
 * (negative/zero/positive); a comparator that is not antisymmetric
 * is undefined behavior per C11 7.22.5.2 and can yield a garbage
 * ordering on some libc implementations.
 */
static int compare(const void *__sym1, const void *__sym2)
{
	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
	double w1 = sym_weight(sym1);
	double w2 = sym_weight(sym2);

	if (w1 > w2)
		return -1;	/* heavier symbol sorts earlier */
	if (w1 < w2)
		return 1;
	return 0;
}
591 static time_t last_refresh
;
593 static long userspace_events
;
/*
 * VT100 escape sequence: cursor home (ESC[H) + clear screen (ESC[2J).
 * \033 is the standard C spelling of ESC; the previous \e escape is a
 * GNU extension and produces the same byte, so behavior is unchanged.
 */
static const char CONSOLE_CLEAR[] = "\033[H\033[2J";
596 static struct sym_entry tmp
[MAX_SYMS
];
598 static void print_sym_table(void)
602 float events_per_sec
= events
/delay_secs
;
603 float kevents_per_sec
= (events
-userspace_events
)/delay_secs
;
605 memcpy(tmp
, sym_table
, sizeof(sym_table
[0])*sym_table_count
);
606 qsort(tmp
, sym_table_count
, sizeof(tmp
[0]), compare
);
608 write(1, CONSOLE_CLEAR
, strlen(CONSOLE_CLEAR
));
611 "------------------------------------------------------------------------------\n");
612 printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ",
614 100.0 - (100.0*((events_per_sec
-kevents_per_sec
)/events_per_sec
)),
615 nmi
? "NMI" : "IRQ");
617 if (nr_counters
== 1)
618 printf("%d ", event_count
[0]);
620 for (counter
= 0; counter
< nr_counters
; counter
++) {
624 printf("%s", event_name(counter
));
630 printf(" (tid: %d", tid
);
634 if (profile_cpu
!= -1)
635 printf(", cpu: %d)\n", profile_cpu
);
640 printf(", %d CPUs)\n", nr_cpus
);
643 printf("------------------------------------------------------------------------------\n\n");
645 if (nr_counters
== 1)
648 printf(" weight events");
650 printf(" RIP kernel function\n"
651 " ______ ______ ________________ _______________\n\n"
655 for (i
= 0; i
< sym_table_count
; i
++) {
658 if (nr_counters
== 1) {
660 tmp
[i
].count
[0] >= count_filter
) {
661 printf("%19.2f - %016llx : %s\n",
662 sym_weight(tmp
+ i
), tmp
[i
].addr
, tmp
[i
].sym
);
667 tmp
[i
].count
[0] >= count_filter
) {
668 printf("%8.1f %10ld - %016llx : %s\n",
671 tmp
[i
].addr
, tmp
[i
].sym
);
676 * Add decay to the counts:
678 for (count
= 0; count
< nr_counters
; count
++)
679 sym_table
[i
].count
[count
] = zero
? 0 : sym_table
[i
].count
[count
] * 7 / 8;
682 if (sym_filter_entry
)
683 show_details(sym_filter_entry
);
685 last_refresh
= time(NULL
);
688 struct pollfd stdin_poll
= { .fd
= 0, .events
= POLLIN
};
690 if (poll(&stdin_poll
, 1, 0) == 1) {
691 printf("key pressed - exiting.\n");
697 static int read_symbol(FILE *in
, struct sym_entry
*s
)
699 static int filter_match
= 0;
704 rc
= fscanf(in
, "%llx %c %499s", &s
->addr
, &stype
, str
);
710 /* skip until end of line: */
714 if (rc
== '\n' || rc
== EOF
|| pos
>= 499)
723 /* Filter out known duplicates and non-text symbols. */
724 if (!strcmp(sym
, "_text"))
726 if (!min_ip
&& !strcmp(sym
, "_stext"))
728 if (!strcmp(sym
, "_etext") || !strcmp(sym
, "_sinittext"))
730 if (stype
!= 'T' && stype
!= 't')
732 if (!strncmp("init_module", sym
, 11) || !strncmp("cleanup_module", sym
, 14))
734 if (strstr(sym
, "_text_start") || strstr(sym
, "_text_end"))
737 s
->sym
= malloc(strlen(str
));
740 strcpy((char *)s
->sym
, str
);
743 /* Tag events to be skipped. */
744 if (!strcmp("default_idle", s
->sym
) || !strcmp("cpu_idle", s
->sym
))
746 else if (!strcmp("enter_idle", s
->sym
) || !strcmp("exit_idle", s
->sym
))
748 else if (!strcmp("mwait_idle", s
->sym
))
751 if (filter_match
== 1) {
752 filter_end
= s
->addr
;
754 if (filter_end
- filter_start
> 10000) {
755 printf("hm, too large filter symbol <%s> - skipping.\n",
757 printf("symbol filter start: %016lx\n", filter_start
);
758 printf(" end: %016lx\n", filter_end
);
759 filter_end
= filter_start
= 0;
764 if (filter_match
== 0 && sym_filter
&& !strcmp(s
->sym
, sym_filter
)) {
766 filter_start
= s
->addr
;
772 int compare_addr(const void *__sym1
, const void *__sym2
)
774 const struct sym_entry
*sym1
= __sym1
, *sym2
= __sym2
;
776 return sym1
->addr
> sym2
->addr
;
779 static void sort_symbol_table(void)
784 qsort(sym_table
, sym_table_count
, sizeof(sym_table
[0]), compare_addr
);
785 for (i
= 0, dups
= 0; i
< sym_table_count
; i
++) {
786 if (sym_table
[i
].addr
== sym_table
[i
+1].addr
) {
787 sym_table
[i
+1].addr
= -1ll;
791 sym_table_count
-= dups
;
795 static void parse_symbols(void)
797 struct sym_entry
*last
;
799 FILE *kallsyms
= fopen("/proc/kallsyms", "r");
802 printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
806 while (!feof(kallsyms
)) {
807 if (read_symbol(kallsyms
, &sym_table
[sym_table_count
]) == 0) {
809 assert(sym_table_count
<= MAX_SYMS
);
814 min_ip
= sym_table
[0].addr
;
815 max_ip
= sym_table
[sym_table_count
-1].addr
;
816 last
= sym_table
+ sym_table_count
++;
823 for (count
=0; count
< sym_table_count
; count
++) {
824 if (!strcmp(sym_table
[count
].sym
, sym_filter
)) {
825 sym_filter_entry
= &sym_table
[count
];
833 for (i
= 0; i
< sym_table_count
; i
++)
834 fprintf(stderr
, "%llx %s\n",
835 sym_table
[i
].addr
, sym_table
[i
].sym
);
843 static void parse_vmlinux(char *filename
)
846 char command
[PATH_MAX
*2];
850 sprintf(command
, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start
, filter_end
, filename
);
852 file
= popen(command
, "r");
857 while (!feof(file
)) {
858 struct source_line
*src
;
862 src
= malloc(sizeof(struct source_line
));
864 memset(src
, 0, sizeof(struct source_line
));
866 if (getline(&src
->line
, &dummy
, file
) < 0)
871 c
= strchr(src
->line
, '\n');
877 lines_tail
= &src
->next
;
879 if (strlen(src
->line
)>8 && src
->line
[8] == ':')
880 src
->EIP
= strtoull(src
->line
, NULL
, 16);
881 if (strlen(src
->line
)>8 && src
->line
[16] == ':')
882 src
->EIP
= strtoull(src
->line
, NULL
, 16);
887 static void record_precise_ip(uint64_t ip
)
889 struct source_line
*line
;
891 for (line
= lines
; line
; line
= line
->next
) {
899 static void lookup_sym_in_vmlinux(struct sym_entry
*sym
)
901 struct source_line
*line
;
902 char pattern
[PATH_MAX
];
903 sprintf(pattern
, "<%s>:", sym
->sym
);
905 for (line
= lines
; line
; line
= line
->next
) {
906 if (strstr(line
->line
, pattern
)) {
913 static void show_lines(struct source_line
*line_queue
, int line_queue_count
)
916 struct source_line
*line
;
919 for (i
= 0; i
< line_queue_count
; i
++) {
920 printf("%8li\t%s\n", line
->count
, line
->line
);
925 #define TRACE_COUNT 3
927 static void show_details(struct sym_entry
*sym
)
929 struct source_line
*line
;
930 struct source_line
*line_queue
= NULL
;
932 int line_queue_count
= 0;
935 lookup_sym_in_vmlinux(sym
);
939 printf("Showing details for %s\n", sym
->sym
);
943 if (displayed
&& strstr(line
->line
, ">:"))
946 if (!line_queue_count
)
950 if (line
->count
>= count_filter
) {
951 show_lines(line_queue
, line_queue_count
);
952 line_queue_count
= 0;
954 } else if (line_queue_count
> TRACE_COUNT
) {
955 line_queue
= line_queue
->next
;
968 * Binary search in the histogram table and record the hit:
970 static void record_ip(uint64_t ip
, int counter
)
972 int left_idx
, middle_idx
, right_idx
, idx
;
973 unsigned long left
, middle
, right
;
975 record_precise_ip(ip
);
978 right_idx
= sym_table_count
-1;
979 assert(ip
<= max_ip
&& ip
>= min_ip
);
981 while (left_idx
+ 1 < right_idx
) {
982 middle_idx
= (left_idx
+ right_idx
) / 2;
984 left
= sym_table
[ left_idx
].addr
;
985 middle
= sym_table
[middle_idx
].addr
;
986 right
= sym_table
[ right_idx
].addr
;
988 if (!(left
<= middle
&& middle
<= right
)) {
989 printf("%016lx...\n%016lx...\n%016lx\n", left
, middle
, right
);
990 printf("%d %d %d\n", left_idx
, middle_idx
, right_idx
);
992 assert(left
<= middle
&& middle
<= right
);
993 if (!(left
<= ip
&& ip
<= right
)) {
994 printf(" left: %016lx\n", left
);
995 printf(" ip: %016lx\n", (unsigned long)ip
);
996 printf("right: %016lx\n", right
);
998 assert(left
<= ip
&& ip
<= right
);
1000 * [ left .... target .... middle .... right ]
1001 * => right := middle
1004 right_idx
= middle_idx
;
1008 * [ left .... middle ... target ... right ]
1011 left_idx
= middle_idx
;
1016 if (!sym_table
[idx
].skip
)
1017 sym_table
[idx
].count
[counter
]++;
1021 static void process_event(uint64_t ip
, int counter
)
1025 if (ip
< min_ip
|| ip
> max_ip
) {
1030 record_ip(ip
, counter
);
1033 static void process_options(int argc
, char *argv
[])
1035 int error
= 0, counter
;
1037 if (strstr(argv
[0], "perfstat"))
1041 int option_index
= 0;
1042 /** Options for getopt */
1043 static struct option long_options
[] = {
1044 {"count", required_argument
, NULL
, 'c'},
1045 {"cpu", required_argument
, NULL
, 'C'},
1046 {"delay", required_argument
, NULL
, 'd'},
1047 {"dump_symtab", no_argument
, NULL
, 'D'},
1048 {"event", required_argument
, NULL
, 'e'},
1049 {"filter", required_argument
, NULL
, 'f'},
1050 {"group", required_argument
, NULL
, 'g'},
1051 {"help", no_argument
, NULL
, 'h'},
1052 {"nmi", required_argument
, NULL
, 'n'},
1053 {"pid", required_argument
, NULL
, 'p'},
1054 {"vmlinux", required_argument
, NULL
, 'x'},
1055 {"symbol", required_argument
, NULL
, 's'},
1056 {"stat", no_argument
, NULL
, 'S'},
1057 {"zero", no_argument
, NULL
, 'z'},
1058 {"mmap_pages", required_argument
, NULL
, 'm'},
1059 {"mmap_info", no_argument
, NULL
, 'M'},
1060 {"munmap_info", no_argument
, NULL
, 'U'},
1063 int c
= getopt_long(argc
, argv
, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:zMU",
1064 long_options
, &option_index
);
1069 case 'a': system_wide
= 1; break;
1070 case 'c': default_interval
= atoi(optarg
); break;
1072 /* CPU and PID are mutually exclusive */
1074 printf("WARNING: CPU switch overriding PID\n");
1078 profile_cpu
= atoi(optarg
); break;
1079 case 'd': delay_secs
= atoi(optarg
); break;
1080 case 'D': dump_symtab
= 1; break;
1082 case 'e': error
= parse_events(optarg
); break;
1084 case 'f': count_filter
= atoi(optarg
); break;
1085 case 'g': group
= atoi(optarg
); break;
1086 case 'h': display_help(); break;
1087 case 'n': nmi
= atoi(optarg
); break;
1089 /* CPU and PID are mutually exclusive */
1090 if (profile_cpu
!= -1) {
1091 printf("WARNING: PID switch overriding CPU\n");
1095 tid
= atoi(optarg
); break;
1096 case 's': sym_filter
= strdup(optarg
); break;
1097 case 'S': run_perfstat
= 1; break;
1098 case 'x': vmlinux
= strdup(optarg
); break;
1099 case 'z': zero
= 1; break;
1100 case 'm': mmap_pages
= atoi(optarg
); break;
1101 case 'M': use_mmap
= 1; break;
1102 case 'U': use_munmap
= 1; break;
1103 default: error
= 1; break;
1118 for (counter
= 0; counter
< nr_counters
; counter
++) {
1119 if (event_count
[counter
])
1122 event_count
[counter
] = default_interval
;
1133 static unsigned int mmap_read_head(struct mmap_data
*md
)
1135 struct perf_counter_mmap_page
*pc
= md
->base
;
1138 head
= pc
->data_head
;
1144 struct timeval last_read
, this_read
;
1146 static void mmap_read(struct mmap_data
*md
)
1148 unsigned int head
= mmap_read_head(md
);
1149 unsigned int old
= md
->prev
;
1150 unsigned char *data
= md
->base
+ page_size
;
1153 gettimeofday(&this_read
, NULL
);
1156 * If we're further behind than half the buffer, there's a chance
1157 * the writer will bite our tail and screw up the events under us.
1159 * If we somehow ended up ahead of the head, we got messed up.
1161 * In either case, truncate and restart at head.
1164 if (diff
> md
->mask
/ 2 || diff
< 0) {
1166 unsigned long msecs
;
1168 timersub(&this_read
, &last_read
, &iv
);
1169 msecs
= iv
.tv_sec
*1000 + iv
.tv_usec
/1000;
1171 fprintf(stderr
, "WARNING: failed to keep up with mmap data."
1172 " Last read %lu msecs ago.\n", msecs
);
1175 * head points to a known good entry, start there.
1180 last_read
= this_read
;
1182 for (; old
!= head
;) {
1184 struct perf_event_header header
;
1189 struct perf_event_header header
;
1194 char filename
[PATH_MAX
];
1197 typedef union event_union
{
1198 struct perf_event_header header
;
1200 struct mmap_event mmap
;
1203 event_t
*event
= (event_t
*)&data
[old
& md
->mask
];
1207 unsigned int size
= event
->header
.size
;
1210 * Event straddles the mmap boundary -- header should always
1211 * be inside due to u64 alignment of output.
1213 if ((old
& md
->mask
) + size
!= ((old
+ size
) & md
->mask
)) {
1214 unsigned int offset
= old
;
1215 unsigned int len
= min(sizeof(*event
), size
), cpy
;
1216 void *dst
= &event_copy
;
1219 cpy
= min(md
->mask
+ 1 - (offset
& md
->mask
), len
);
1220 memcpy(dst
, &data
[offset
& md
->mask
], cpy
);
1226 event
= &event_copy
;
1231 switch (event
->header
.type
) {
1233 case PERF_EVENT_IP
| __PERF_EVENT_TID
:
1234 process_event(event
->ip
.ip
, md
->counter
);
1237 case PERF_EVENT_MMAP
:
1238 case PERF_EVENT_MUNMAP
:
1239 printf("%s: %Lu %Lu %Lu %s\n",
1240 event
->header
.type
== PERF_EVENT_MMAP
1241 ? "mmap" : "munmap",
1245 event
->mmap
.filename
);
1253 int main(int argc
, char *argv
[])
1255 struct pollfd event_array
[MAX_NR_CPUS
* MAX_COUNTERS
];
1256 struct mmap_data mmap_array
[MAX_NR_CPUS
][MAX_COUNTERS
];
1257 struct perf_counter_hw_event hw_event
;
1258 int i
, counter
, group_fd
, nr_poll
= 0;
1262 page_size
= sysconf(_SC_PAGE_SIZE
);
1264 process_options(argc
, argv
);
1266 nr_cpus
= sysconf(_SC_NPROCESSORS_ONLN
);
1267 assert(nr_cpus
<= MAX_NR_CPUS
);
1268 assert(nr_cpus
>= 0);
1271 return do_perfstat(argc
, argv
);
1273 if (tid
!= -1 || profile_cpu
!= -1)
1277 if (vmlinux
&& sym_filter_entry
)
1278 parse_vmlinux(vmlinux
);
1280 for (i
= 0; i
< nr_cpus
; i
++) {
1282 for (counter
= 0; counter
< nr_counters
; counter
++) {
1285 if (tid
== -1 && profile_cpu
== -1)
1288 memset(&hw_event
, 0, sizeof(hw_event
));
1289 hw_event
.config
= event_id
[counter
];
1290 hw_event
.irq_period
= event_count
[counter
];
1291 hw_event
.record_type
= PERF_RECORD_IRQ
;
1293 hw_event
.include_tid
= 1;
1294 hw_event
.mmap
= use_mmap
;
1295 hw_event
.munmap
= use_munmap
;
1297 fd
[i
][counter
] = sys_perf_counter_open(&hw_event
, tid
, cpu
, group_fd
, 0);
1298 if (fd
[i
][counter
] < 0) {
1300 printf("kerneltop error: syscall returned with %d (%s)\n",
1301 fd
[i
][counter
], strerror(err
));
1303 printf("Are you root?\n");
1306 assert(fd
[i
][counter
] >= 0);
1307 fcntl(fd
[i
][counter
], F_SETFL
, O_NONBLOCK
);
1310 * First counter acts as the group leader:
1312 if (group
&& group_fd
== -1)
1313 group_fd
= fd
[i
][counter
];
1315 event_array
[nr_poll
].fd
= fd
[i
][counter
];
1316 event_array
[nr_poll
].events
= POLLIN
;
1319 mmap_array
[i
][counter
].counter
= counter
;
1320 mmap_array
[i
][counter
].prev
= 0;
1321 mmap_array
[i
][counter
].mask
= mmap_pages
*page_size
- 1;
1322 mmap_array
[i
][counter
].base
= mmap(NULL
, (mmap_pages
+1)*page_size
,
1323 PROT_READ
, MAP_SHARED
, fd
[i
][counter
], 0);
1324 if (mmap_array
[i
][counter
].base
== MAP_FAILED
) {
1325 printf("kerneltop error: failed to mmap with %d (%s)\n",
1326 errno
, strerror(errno
));
1332 printf("KernelTop refresh period: %d seconds\n", delay_secs
);
1333 last_refresh
= time(NULL
);
1338 for (i
= 0; i
< nr_cpus
; i
++) {
1339 for (counter
= 0; counter
< nr_counters
; counter
++)
1340 mmap_read(&mmap_array
[i
][counter
]);
1343 if (time(NULL
) >= last_refresh
+ delay_secs
) {
1345 events
= userspace_events
= 0;
1349 ret
= poll(event_array
, nr_poll
, 1000);