1 /* $NetBSD: main.c,v 1.13 2008/04/28 15:36:01 ad Exp $ */
4 * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
7 * This code is derived from software contributed to The NetBSD Foundation
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
35 * - Tracking of times for sleep locks is broken.
36 * - Need better analysis and tracking of events.
37 * - Shouldn't have to parse the namelist here. We should use something like
39 * - The way the namelist is searched sucks, is it worth doing something
43 #include <sys/cdefs.h>
45 __RCSID("$NetBSD: main.c,v 1.13 2008/04/28 15:36:01 ad Exp $");
48 #include <sys/types.h>
49 #include <sys/param.h>
51 #include <sys/fcntl.h>
52 #include <sys/ioctl.h>
54 #include <sys/signal.h>
55 #include <sys/sysctl.h>
57 #include <dev/lockstat.h>
/* Path of the lockstat device node; main() opens it read-only. */
73 #define _PATH_DEV_LOCKSTAT "/dev/lockstat"
/*
 * Floating-point unit factors used when scaling results: main() divides
 * tv_nsec by MICRO, derives tscale by dividing by NANO, and computes the
 * per-CPU scale as PICO / frequency.
 */
76 #define MICRO 1000000.0
77 #define NANO 1000000000.0
78 #define PICO 1000000000000.0
/*
 * Tail-queue head types. struct lock_head (locklist_t) heads a queue of
 * struct lockstruct; struct buf_head (buflist_t) heads a queue of
 * struct lsbuf.
 */
80 TAILQ_HEAD(lock_head
, lockstruct
);
81 typedef struct lock_head locklist_t
;
82 TAILQ_HEAD(buf_head
, lsbuf
);
83 typedef struct buf_head buflist_t
;
85 typedef struct lockstruct
{
86 TAILQ_ENTRY(lockstruct
) chain
;
/*
 * Lock type names paired with their LB_* lock flag. main() passes this
 * table to matchname() and listnames(); both walk it until an entry
 * whose name is NULL.
 */
102 const name_t locknames
[] = {
103 { "adaptive_mutex", LB_ADAPTIVE_MUTEX
},
104 { "spin_mutex", LB_SPIN_MUTEX
},
105 { "rwlock", LB_RWLOCK
},
106 { "kernel_lock", LB_KERNEL_LOCK
},
107 { "preemption", LB_NOPREEMPT
},
/*
 * Event type names paired with their LB_* event flag. main() passes this
 * table to matchname() and listnames().
 * NOTE(review): only the two sleep entries are visible in this listing;
 * other entries may exist on lines not shown.
 */
111 const name_t eventnames
[] = {
113 { "sleep_exclusive", LB_SLEEP1
},
114 { "sleep_shared", LB_SLEEP2
},
/*
 * Every lock type / event type combination that is reported. Each entry
 * pairs the heading text with a mask made of one lock flag OR'ed with one
 * event flag. main() walks this table until a NULL name and hands each
 * entry's mask and name to display().
 */
118 const name_t alltypes
[] = {
119 { "Adaptive mutex spin", LB_ADAPTIVE_MUTEX
| LB_SPIN
},
120 { "Adaptive mutex sleep", LB_ADAPTIVE_MUTEX
| LB_SLEEP1
},
121 { "Spin mutex spin", LB_SPIN_MUTEX
| LB_SPIN
},
122 { "RW lock sleep (writer)", LB_RWLOCK
| LB_SLEEP1
},
123 { "RW lock sleep (reader)", LB_RWLOCK
| LB_SLEEP2
},
124 { "RW lock spin", LB_RWLOCK
| LB_SPIN
},
125 { "Kernel lock spin", LB_KERNEL_LOCK
| LB_SPIN
},
126 { "Kernel preemption defer", LB_NOPREEMPT
| LB_SPIN
},
/*
 * Per-CPU time scale factors, one slot per element of ld.ld_freq.
 * main() stores PICO / ld.ld_freq[i] for each non-zero frequency, and
 * makelists() multiplies a record's lb_times value by
 * cpuscale[lb->lb_cpu].
 */
145 double cpuscale
[sizeof(ld
.ld_freq
) / sizeof(ld
.ld_freq
[0])];
/* Forward declarations for the helper routines used in this file. */
148 void findsym(findsym_t
, char *, uintptr_t *, uintptr_t *, bool);
149 void spawn(int, char **);
150 void display(int, const char *name
);
151 void listnames(const name_t
*);
152 void collapse(bool, bool);
153 int matchname(const name_t
*, char *);
154 void makelists(int, int);
158 lock_t
*morelocks(void);
/*
 * Program entry point. The visible statements, in order: parse the
 * command line with getopt(), open the output file, open and load a
 * kernel symbol table, build the lockstat enable request (le), open
 * the lockstat device and check its interface version, enable tracing,
 * disable tracing and read the trace buffers back, compute the scale
 * factors, optionally collapse addresses, and print one table per
 * entry of alltypes.
 *
 * NOTE(review): this listing omits a number of original lines (braces,
 * case labels, some conditions and calls). The comments below describe
 * only the statements that are visible.
 */
161 main(int argc
, char **argv
)
163 int eventtype
, locktype
, ch
, nlfd
, fd
, i
;
164 bool sflag
, pflag
, mflag
, Mflag
;
165 const char *nlistf
, *outf
;
166 char *lockname
, *funcname
;
/* Option loop; the option letters match those described in the usage text. */
184 while ((ch
= getopt(argc
, argv
, "E:F:L:MN:T:b:ceflmo:pst")) != -1)
187 eventtype
= matchname(eventnames
, optarg
);
199 locktype
= matchname(locknames
, optarg
);
/*
 * Buffer count: the argument must start with a digit and be consumed
 * completely by strtol().
 * NOTE(review): casting a plain char to u_int does not map a negative
 * char into the unsigned char range that isdigit() expects; consider
 * casting through unsigned char instead.
 */
202 nbufs
= (int)strtol(optarg
, &p
, 0);
203 if (!isdigit((u_int
)*optarg
) || *p
!= '\0')
210 listnames(eventnames
);
234 listnames(locknames
);
/*
 * Create or truncate the output file with mode 0600 and wrap the
 * descriptor in a stdio stream (outfp).
 */
246 fd
= open(outf
, O_WRONLY
| O_CREAT
| O_TRUNC
, 0600);
248 err(EXIT_FAILURE
, "opening %s", outf
);
249 outfp
= fdopen(fd
, "w");
254 * Find the name list for resolving symbol names, and load it into
257 if (nlistf
== NULL
) {
258 nlfd
= open(_PATH_KSYMS
, O_RDONLY
);
259 nlistf
= getbootfile();
263 if ((nlfd
= open(nlistf
, O_RDONLY
)) < 0)
264 err(EXIT_FAILURE
, "cannot open " _PATH_KSYMS
" or %s",
/* Try the 32-bit symbol loader first, then the 64-bit one; fail if both do. */
267 if (loadsym32(nlfd
) != 0) {
268 if (loadsym64(nlfd
) != 0)
269 errx(EXIT_FAILURE
, "unable to load symbol table");
274 memset(&le
, 0, sizeof(le
));
278 * Set up initial filtering.
280 if (lockname
!= NULL
) {
281 findsym(LOCK_BYNAME
, lockname
, &le
.le_lockstart
,
282 &le
.le_lockend
, true);
283 le
.le_flags
|= LE_ONE_LOCK
;
286 le
.le_flags
|= LE_CALLSITE
;
288 le
.le_flags
|= LE_LOCK
;
289 if (funcname
!= NULL
) {
292 findsym(FUNC_BYNAME
, funcname
, &le
.le_csstart
, &le
.le_csend
, true);
293 le
.le_flags
|= LE_ONE_CALLSITE
;
/* The request mask combines the selected event bits and lock type bits. */
295 le
.le_mask
= (eventtype
& LB_EVENT_MASK
) | (locktype
& LB_LOCK_MASK
);
300 if ((lsfd
= open(_PATH_DEV_LOCKSTAT
, O_RDONLY
)) < 0)
301 err(EXIT_FAILURE
, "cannot open " _PATH_DEV_LOCKSTAT
);
/*
 * Fetch the interface version from the device (ch is reused as the
 * result variable) and compare it with LS_VERSION before enabling.
 */
302 if (ioctl(lsfd
, IOC_LOCKSTAT_GVERSION
, &ch
) < 0)
303 err(EXIT_FAILURE
, "ioctl");
304 if (ch
!= LS_VERSION
)
306 "incompatible lockstat interface version (%d, kernel %d)",
308 if (ioctl(lsfd
, IOC_LOCKSTAT_ENABLE
, &le
))
309 err(EXIT_FAILURE
, "cannot enable tracing");
312 * Execute the traced program.
317 * Stop tracing, and read the trace buffers from the kernel.
319 if (ioctl(lsfd
, IOC_LOCKSTAT_DISABLE
, &ld
) == -1) {
320 if (errno
== EOVERFLOW
) {
321 warnx("overflowed available kernel trace buffers");
324 err(EXIT_FAILURE
, "cannot disable tracing");
/*
 * Allocate ld.ld_size bytes and fetch the trace with a single read();
 * any result other than ld.ld_size is fatal.
 */
326 if ((bufs
= malloc(ld
.ld_size
)) == NULL
)
327 err(EXIT_FAILURE
, "cannot allocate memory for user buffers");
328 if (read(lsfd
, bufs
, ld
.ld_size
) != ld
.ld_size
)
329 err(EXIT_FAILURE
, "reading from " _PATH_DEV_LOCKSTAT
);
331 err(EXIT_FAILURE
, "close(" _PATH_DEV_LOCKSTAT
")");
334 * Figure out how to scale the results. For internal use we convert
335 * all times from CPU frequency based to picoseconds, and values are
336 * eventually displayed in ms.
338 for (i
= 0; i
< sizeof(ld
.ld_freq
) / sizeof(ld
.ld_freq
[0]); i
++)
339 if (ld
.ld_freq
[i
] != 0)
340 cpuscale
[i
] = PICO
/ ld
.ld_freq
[i
];
/*
 * Elapsed tracing time, built from ld.ld_time.
 * NOTE(review): MILLI is defined on a line not shown here; presumably
 * 1000.0, which would make ms a millisecond value - confirm.
 */
341 ms
= ld
.ld_time
.tv_sec
* MILLI
+ ld
.ld_time
.tv_nsec
/ MICRO
;
/*
 * cscale scales counts and tscale scales times in display(). The
 * condition guarding the per-CPU division is not visible here.
 */
343 cscale
= 1.0 / ncpu();
346 cscale
*= (sflag
? MILLI
/ ms
: 1.0);
347 tscale
= cscale
/ NANO
;
/* Number of lsbuf_t records contained in the data just read. */
348 nbufs
= (int)(ld
.ld_size
/ sizeof(lsbuf_t
));
350 TAILQ_INIT(&locklist
);
351 TAILQ_INIT(&sortlist
);
352 TAILQ_INIT(&freelist
);
354 if ((mflag
| Mflag
) != 0)
355 collapse(mflag
, Mflag
);
358 * Display the results.
360 fprintf(outfp
, "Elapsed time: %.2f seconds.", ms
/ MILLI
);
361 if (sflag
|| pflag
) {
362 fprintf(outfp
, " Displaying ");
364 fprintf(outfp
, "per-CPU ");
366 fprintf(outfp
, "per-second ");
367 fprintf(outfp
, "averages.");
/*
 * Walk every lock/event combination. When eventtype or locktype is not
 * -1, an entry is tested against it by masking the entry's bits; the
 * statement taken on a mismatch is not visible here.
 */
371 for (name
= alltypes
; name
->name
!= NULL
; name
++) {
372 if (eventtype
!= -1 &&
373 (name
->mask
& LB_EVENT_MASK
) != eventtype
)
375 if (locktype
!= -1 &&
376 (name
->mask
& LB_LOCK_MASK
) != locktype
)
379 display(name
->mask
, name
->name
);
383 fprintf(outfp
, "None of the selected events were recorded.\n");
393 "%s [options] <command>\n\n"
394 "-b nbuf\t\tset number of event buffers to allocate\n"
395 "-c\t\treport percentage of total events by count, not time\n"
396 "-E event\t\tdisplay only one type of event\n"
397 "-e\t\tlist event types\n"
398 "-F func\t\tlimit trace to one function\n"
399 "-f\t\ttrace only by function\n"
400 "-L lock\t\tlimit trace to one lock (name, or address)\n"
401 "-l\t\ttrace only by lock\n"
402 "-M\t\tmerge lock addresses within unique objects\n"
403 "-m\t\tmerge call sites within unique functions\n"
404 "-N nlist\tspecify name list file\n"
405 "-o file\t\tsend output to named file, not stdout\n"
406 "-p\t\tshow average count/time per CPU, not total\n"
407 "-s\t\tshow average count/time per second, not total\n"
408 "-T type\t\tdisplay only one type of lock\n"
409 "-t\t\tlist lock types\n",
410 getprogname(), getprogname());
/*
 * Print every name in a name table to standard output, one per line.
 *
 * name: first entry of the table; iteration stops at the entry whose
 *       name member is NULL, so the table must contain such an entry.
 */
423 listnames(const name_t
*name
)
426 for (; name
->name
!= NULL
; name
++)
427 printf("%s\n", name
->name
);
/*
 * Look up a comma-separated list of identifiers in a name table.
 *
 * name:   first entry of a table terminated by an entry with NULL name.
 * string: list to parse; strsep() modifies it in place.
 *
 * Each piece is compared case-insensitively against the table. If the
 * scan reaches the terminating entry, the program exits through errx()
 * with an "unknown identifier" message.
 *
 * NOTE(review): the lines that build and return the result are not
 * visible in this listing.
 * NOTE(review): in the visible loop the table cursor is not rewound at
 * the start of each piece, so a later piece would only be compared with
 * entries at or after the previous match - confirm against the full
 * source.
 */
433 matchname(const name_t
*name
, char *string
)
441 while ((sp
= strsep(&string
, ",")) != NULL
) {
445 for (; name
->name
!= NULL
; name
++) {
446 if (strcasecmp(name
->name
, sp
) == 0) {
451 if (name
->name
== NULL
)
452 errx(EXIT_FAILURE
, "unknown identifier `%s'", sp
);
463 * Return the number of CPUs in the running system.
474 if (sysctl(mib
, 2, &rv
, &varlen
, NULL
, (size_t)0) < 0)
481 * Call into the ELF parser and look up a symbol by name or by address.
/*
 * Resolve a symbol through the ELF helpers findsym32() / findsym64().
 *
 * find:  kind of lookup (values used in this file: LOCK_BYNAME,
 *        FUNC_BYNAME, LOCK_BYADDR, FUNC_BYADDR).
 * name:  symbol name for by-name lookups; for other lookups it is also
 *        used as an output buffer (see the snprintf() below). Callers
 *        in this file sometimes pass NULL.
 * start: in/out address; end: out address, may be NULL in callers.
 * chg:   NOTE(review): its use is on lines not shown here.
 *
 * Visible behavior: a LOCK_BYNAME name that begins with a digit is
 * parsed as a number with strtoul() into *start. A failed by-name
 * lookup exits through errx(). On another path *start is formatted
 * as 16 hex digits into name.
 *
 * NOTE(review): the conditions selecting the 32-bit or 64-bit helper,
 * and guarding the snprintf(), are not visible. The snprintf() assumes
 * name points to at least NAME_SIZE bytes - confirm for every caller.
 */
484 findsym(findsym_t find
, char *name
, uintptr_t *start
, uintptr_t *end
, bool chg
)
486 uintptr_t tend
, sa
, ea
;
499 if (find
== LOCK_BYNAME
) {
500 if (isdigit((u_int
)name
[0])) {
501 *start
= (uintptr_t)strtoul(name
, &p
, 0);
508 rv
= findsym64(find
, name
, start
, end
);
510 rv
= findsym32(find
, name
, start
, end
);
512 if (find
== FUNC_BYNAME
|| find
== LOCK_BYNAME
) {
514 errx(EXIT_FAILURE
, "unable to find symbol `%s'", name
);
519 snprintf(name
, NAME_SIZE
, "%016lx", (long)*start
);
523 * Fork off the child process and wait for it to complete. We trap SIGINT
524 * so that the caller can use Ctrl-C to stop tracing early and still get
/*
 * Run the traced command in a child process.
 *
 * argv: NULL-terminated argument vector; argv[0] is the program that
 *       execvp() looks up and runs.
 *
 * Visible behavior: fork(); the child calls execvp() and exits through
 * err() if that fails; a failed fork() is also fatal. The parent
 * installs nullsig as the SIGINT handler and later restores SIG_DFL.
 *
 * NOTE(review): the case labels and the wait for the child are on
 * lines not shown in this listing.
 */
528 spawn(int argc
, char **argv
)
532 switch (pid
= fork()) {
535 if (execvp(argv
[0], argv
) == -1)
536 err(EXIT_FAILURE
, "cannot exec");
539 err(EXIT_FAILURE
, "cannot fork to exec");
542 signal(SIGINT
, nullsig
);
544 signal(SIGINT
, SIG_DFL
);
550 * Allocate a new block of lock_t structures.
/*
 * NOTE(review): the function header is not visible in this listing;
 * presumably this is the body of morelocks(), declared earlier as
 * returning lock_t * - confirm.
 *
 * Allocates an array of `batch` (32) lock_t structures with one
 * malloc() call and appends each element to the tail of freelist.
 *
 * NOTE(review): no check of the malloc() result is visible between the
 * allocation and the loop; the return statement is also not shown.
 */
555 const static int batch
= 32;
556 lock_t
*l
, *lp
, *max
;
558 l
= (lock_t
*)malloc(sizeof(*l
) * batch
);
560 for (lp
= l
, max
= l
+ batch
; lp
< max
; lp
++)
561 TAILQ_INSERT_TAIL(&freelist
, lp
, chain
);
567 * Collapse addresses from unique objects.
/*
 * Walk all nbufs trace records in bufs and run address lookups on them.
 *
 * func: when true, records with a non-zero lb_callsite are passed to
 *       findsym(FUNC_BYADDR, ...).
 * lock: when true, records with a non-zero lb_lock are passed to
 *       findsym(LOCK_BYADDR, ...).
 *
 * The address is passed by pointer and the name and end arguments are
 * NULL. NOTE(review): the final argument of both calls is on a line
 * not shown, so whether the stored address is rewritten cannot be
 * confirmed from this listing.
 */
570 collapse(bool func
, bool lock
)
574 for (lb
= bufs
, max
= bufs
+ nbufs
; lb
< max
; lb
++) {
575 if (func
&& lb
->lb_callsite
!= 0) {
576 findsym(FUNC_BYADDR
, NULL
, &lb
->lb_callsite
, NULL
,
579 if (lock
&& lb
->lb_lock
!= 0) {
580 findsym(LOCK_BYADDR
, NULL
, &lb
->lb_lock
, NULL
,
587 * From the kernel supplied data, construct two dimensional lists of locks
588 * and event buffers, indexed by lock type and sorted by event type.
/*
 * Rebuild the global locklist for one lock type and one event.
 *
 * mask:  lock type selector; only its LB_LOCK_MASK bits are used.
 * event: index into each record's lb_counts[] and lb_times[] arrays.
 *
 * Visible phases:
 *  1. Move every lock_t left on locklist back to freelist.
 *  2. Scan bufs[0 .. nbufs). Each record is tested for a lock type
 *     mismatch or a zero count for this event. A lock_t for the
 *     record's lock address is searched for on sortlist, or taken
 *     from freelist and initialised. The record's time is scaled in
 *     place by cpuscale[lb_cpu] and added, with its count, to the
 *     lock_t totals. Records sharing a call site are merged on the
 *     lock's tosort list.
 *  3. Drain sortlist: move each lock's records from tosort to bufs,
 *     then insert the lock into locklist, in both cases in front of
 *     the first element it compares greater than (by count or time).
 *
 * NOTE(review): several control statements (continue/break, the
 * condition choosing count or time as the sort key, the refill of an
 * empty freelist) are on lines not shown in this listing.
 */
591 makelists(int mask
, int event
)
593 lsbuf_t
*lb
, *lb2
, *max
;
598 * Recycle lock_t structures from the last run.
600 while ((l
= TAILQ_FIRST(&locklist
)) != NULL
) {
601 TAILQ_REMOVE(&locklist
, l
, chain
);
602 TAILQ_INSERT_HEAD(&freelist
, l
, chain
);
605 type
= mask
& LB_LOCK_MASK
;
607 for (lb
= bufs
, max
= bufs
+ nbufs
; lb
< max
; lb
++) {
608 if ((lb
->lb_flags
& LB_LOCK_MASK
) != type
||
609 lb
->lb_counts
[event
] == 0)
613 * Look for a record descibing this lock, and allocate a
616 TAILQ_FOREACH(l
, &sortlist
, chain
) {
617 if (l
->lock
== lb
->lb_lock
)
621 if ((l
= TAILQ_FIRST(&freelist
)) == NULL
)
623 TAILQ_REMOVE(&freelist
, l
, chain
);
624 l
->flags
= lb
->lb_flags
;
625 l
->lock
= lb
->lb_lock
;
630 TAILQ_INIT(&l
->tosort
);
631 TAILQ_INIT(&l
->bufs
);
632 TAILQ_INSERT_TAIL(&sortlist
, l
, chain
);
636 * Scale the time values per buffer and summarise
637 * times+counts per lock.
639 lb
->lb_times
[event
] *= cpuscale
[lb
->lb_cpu
];
640 l
->count
+= lb
->lb_counts
[event
];
641 l
->time
+= lb
->lb_times
[event
];
644 * Merge same lock+callsite pairs from multiple CPUs
647 TAILQ_FOREACH(lb2
, &l
->tosort
, lb_chain
.tailq
) {
648 if (lb
->lb_callsite
== lb2
->lb_callsite
)
652 lb2
->lb_counts
[event
] += lb
->lb_counts
[event
];
653 lb2
->lb_times
[event
] += lb
->lb_times
[event
];
655 TAILQ_INSERT_HEAD(&l
->tosort
, lb
, lb_chain
.tailq
);
661 * Now sort the lists.
663 while ((l
= TAILQ_FIRST(&sortlist
)) != NULL
) {
664 TAILQ_REMOVE(&sortlist
, l
, chain
);
667 * Sort the buffers into the per-lock list.
669 while ((lb
= TAILQ_FIRST(&l
->tosort
)) != NULL
) {
670 TAILQ_REMOVE(&l
->tosort
, lb
, lb_chain
.tailq
);
672 lb2
= TAILQ_FIRST(&l
->bufs
);
673 while (lb2
!= NULL
) {
675 if (lb
->lb_counts
[event
] >
676 lb2
->lb_counts
[event
])
678 } else if (lb
->lb_times
[event
] >
679 lb2
->lb_times
[event
])
681 lb2
= TAILQ_NEXT(lb2
, lb_chain
.tailq
);
684 TAILQ_INSERT_TAIL(&l
->bufs
, lb
,
687 TAILQ_INSERT_BEFORE(lb2
, lb
, lb_chain
.tailq
);
691 * Sort this lock into the per-type list, based on the
694 l2
= TAILQ_FIRST(&locklist
);
697 if (l
->count
> l2
->count
)
699 } else if (l
->time
> l2
->time
)
701 l2
= TAILQ_NEXT(l2
, chain
);
704 TAILQ_INSERT_TAIL(&locklist
, l
, chain
);
706 TAILQ_INSERT_BEFORE(l2
, l
, chain
);
711 * Display a summary table for one lock type / event type pair.
/*
 * Print the report table for one lock type / event type pair to outfp.
 *
 * mask: combined lock and event flags. The event array index is taken
 *       as (mask & LB_EVENT_MASK) - 1, so event flag values are one
 *       greater than their lb_counts[] / lb_times[] index.
 * name: heading text printed after "-- ".
 *
 * Visible behavior: calls makelists() to rebuild locklist, tests
 * whether it is empty, and prints the table header. For each lock it
 * resolves the lock name with findsym(LOCK_BYADDR) when the name is
 * still empty, and prints an "<all>" summary row when lflag is set or
 * the lock has more than one record. It then prints one row per
 * record, with the call site resolved through findsym(FUNC_BYADDR)
 * into fname. Counts are multiplied by cscale and times by tscale.
 *
 * NOTE(review): the accumulation of pcscale, the choice between count
 * and time for `metric`, and the action taken on an empty list are on
 * lines not shown in this listing.
 */
714 display(int mask
, const char *name
)
718 double pcscale
, metric
;
719 char fname
[NAME_SIZE
];
722 event
= (mask
& LB_EVENT_MASK
) - 1;
723 makelists(mask
, event
);
725 if (TAILQ_EMPTY(&locklist
))
728 fprintf(outfp
, "\n-- %s\n\n"
729 "Total%% Count Time/ms Lock Caller\n"
730 "------ ------- --------- ---------------------- ------------------------------\n",
734 * Sum up all events for this type of lock + event.
737 TAILQ_FOREACH(l
, &locklist
, chain
) {
/* Turn the accumulated total into a factor that yields percentages. */
747 pcscale
= (100.0 / pcscale
);
750 * For each lock, print a summary total, followed by a breakdown by
753 TAILQ_FOREACH(l
, &locklist
, chain
) {
760 if (l
->name
[0] == '\0')
761 findsym(LOCK_BYADDR
, l
->name
, &l
->lock
, NULL
, false);
763 if (lflag
|| l
->nbufs
> 1)
764 fprintf(outfp
, "%6.2f %7d %9.2f %-22s <all>\n",
765 metric
, (int)(l
->count
* cscale
),
766 l
->time
* tscale
, l
->name
);
771 TAILQ_FOREACH(lb
, &l
->bufs
, lb_chain
.tailq
) {
773 metric
= lb
->lb_counts
[event
];
775 metric
= lb
->lb_times
[event
];
778 findsym(FUNC_BYADDR
, fname
, &lb
->lb_callsite
, NULL
,
780 fprintf(outfp
, "%6.2f %7d %9.2f %-22s %s\n",
781 metric
, (int)(lb
->lb_counts
[event
] * cscale
),
782 lb
->lb_times
[event
] * tscale
, l
->name
, fname
);