4 * Builtin record command: Record the profile of a workload
5 * (or a CPU, or a PID) into the perf.data output file - for
6 * later analysis via perf report.
8 #define _FILE_OFFSET_BITS 64
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/symbol.h"
26 #include "util/cpumap.h"
27 #include "util/thread_map.h"
33 #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y))
40 static u64 user_interval
= ULLONG_MAX
;
41 static u64 default_interval
= 0;
43 static unsigned int page_size
;
44 static unsigned int mmap_pages
= UINT_MAX
;
45 static unsigned int user_freq
= UINT_MAX
;
46 static int freq
= 1000;
48 static int pipe_output
= 0;
49 static const char *output_name
= NULL
;
51 static int realtime_prio
= 0;
52 static bool nodelay
= false;
53 static bool raw_samples
= false;
54 static bool sample_id_all_avail
= true;
55 static bool system_wide
= false;
56 static pid_t target_pid
= -1;
57 static pid_t target_tid
= -1;
58 static pid_t child_pid
= -1;
59 static bool no_inherit
= false;
60 static enum write_mode_t write_mode
= WRITE_FORCE
;
61 static bool call_graph
= false;
62 static bool inherit_stat
= false;
63 static bool no_samples
= false;
64 static bool sample_address
= false;
65 static bool sample_time
= false;
66 static bool no_buildid
= false;
67 static bool no_buildid_cache
= false;
68 static struct perf_evlist
*evsel_list
;
70 static long samples
= 0;
71 static u64 bytes_written
= 0;
73 static int file_new
= 1;
74 static off_t post_processing_offset
;
76 static struct perf_session
*session
;
77 static const char *cpu_list
;
79 static void advance_output(size_t size
)
81 bytes_written
+= size
;
84 static void write_output(void *buf
, size_t size
)
87 int ret
= write(output
, buf
, size
);
90 die("failed to write");
99 static int process_synthesized_event(union perf_event
*event
,
100 struct perf_sample
*sample __used
,
101 struct perf_session
*self __used
)
103 write_output(event
, event
->header
.size
);
107 static void mmap_read(struct perf_mmap
*md
)
109 unsigned int head
= perf_mmap__read_head(md
);
110 unsigned int old
= md
->prev
;
111 unsigned char *data
= md
->base
+ page_size
;
122 if ((old
& md
->mask
) + size
!= (head
& md
->mask
)) {
123 buf
= &data
[old
& md
->mask
];
124 size
= md
->mask
+ 1 - (old
& md
->mask
);
127 write_output(buf
, size
);
130 buf
= &data
[old
& md
->mask
];
134 write_output(buf
, size
);
137 perf_mmap__write_tail(md
, old
);
140 static volatile int done
= 0;
141 static volatile int signr
= -1;
143 static void sig_handler(int sig
)
149 static void sig_atexit(void)
152 kill(child_pid
, SIGTERM
);
154 if (signr
== -1 || signr
== SIGUSR1
)
157 signal(signr
, SIG_DFL
);
158 kill(getpid(), signr
);
161 static void config_attr(struct perf_evsel
*evsel
, struct perf_evlist
*evlist
)
163 struct perf_event_attr
*attr
= &evsel
->attr
;
164 int track
= !evsel
->idx
; /* only the first counter needs these */
166 attr
->inherit
= !no_inherit
;
167 attr
->read_format
= PERF_FORMAT_TOTAL_TIME_ENABLED
|
168 PERF_FORMAT_TOTAL_TIME_RUNNING
|
171 attr
->sample_type
|= PERF_SAMPLE_IP
| PERF_SAMPLE_TID
;
173 if (evlist
->nr_entries
> 1)
174 attr
->sample_type
|= PERF_SAMPLE_ID
;
177 * We default some events to a default interval of 1. But keep
178 * it a weak assumption overridable by the user.
180 if (!attr
->sample_period
|| (user_freq
!= UINT_MAX
&&
181 user_interval
!= ULLONG_MAX
)) {
183 attr
->sample_type
|= PERF_SAMPLE_PERIOD
;
185 attr
->sample_freq
= freq
;
187 attr
->sample_period
= default_interval
;
192 attr
->sample_freq
= 0;
195 attr
->inherit_stat
= 1;
197 if (sample_address
) {
198 attr
->sample_type
|= PERF_SAMPLE_ADDR
;
199 attr
->mmap_data
= track
;
203 attr
->sample_type
|= PERF_SAMPLE_CALLCHAIN
;
206 attr
->sample_type
|= PERF_SAMPLE_CPU
;
208 if (sample_id_all_avail
&&
209 (sample_time
|| system_wide
|| !no_inherit
|| cpu_list
))
210 attr
->sample_type
|= PERF_SAMPLE_TIME
;
213 attr
->sample_type
|= PERF_SAMPLE_TIME
;
214 attr
->sample_type
|= PERF_SAMPLE_RAW
;
215 attr
->sample_type
|= PERF_SAMPLE_CPU
;
220 attr
->wakeup_events
= 1;
226 if (target_pid
== -1 && target_tid
== -1 && !system_wide
) {
228 attr
->enable_on_exec
= 1;
232 static bool perf_evlist__equal(struct perf_evlist
*evlist
,
233 struct perf_evlist
*other
)
235 struct perf_evsel
*pos
, *pair
;
237 if (evlist
->nr_entries
!= other
->nr_entries
)
240 pair
= list_entry(other
->entries
.next
, struct perf_evsel
, node
);
242 list_for_each_entry(pos
, &evlist
->entries
, node
) {
243 if (memcmp(&pos
->attr
, &pair
->attr
, sizeof(pos
->attr
) != 0))
245 pair
= list_entry(pair
->node
.next
, struct perf_evsel
, node
);
251 static void open_counters(struct perf_evlist
*evlist
)
253 struct perf_evsel
*pos
;
255 if (evlist
->cpus
->map
[0] < 0)
258 list_for_each_entry(pos
, &evlist
->entries
, node
) {
259 struct perf_event_attr
*attr
= &pos
->attr
;
261 * Check if parse_single_tracepoint_event has already asked for
264 * XXX this is kludgy but short term fix for problems introduced by
265 * eac23d1c that broke 'perf script' by having different sample_types
266 * when using multiple tracepoint events when we use a perf binary
267 * that tries to use sample_id_all on an older kernel.
269 * We need to move counter creation to perf_session, support
270 * different sample_types, etc.
272 bool time_needed
= attr
->sample_type
& PERF_SAMPLE_TIME
;
274 config_attr(pos
, evlist
);
276 attr
->sample_id_all
= sample_id_all_avail
? 1 : 0;
278 if (perf_evsel__open(pos
, evlist
->cpus
, evlist
->threads
, group
) < 0) {
281 if (err
== EPERM
|| err
== EACCES
) {
282 ui__warning_paranoid();
284 } else if (err
== ENODEV
&& cpu_list
) {
285 die("No such device - did you specify"
286 " an out-of-range profile CPU?\n");
287 } else if (err
== EINVAL
&& sample_id_all_avail
) {
289 * Old kernel, no attr->sample_id_all field
291 sample_id_all_avail
= false;
292 if (!sample_time
&& !raw_samples
&& !time_needed
)
293 attr
->sample_type
&= ~PERF_SAMPLE_TIME
;
295 goto retry_sample_id
;
299 * If it's cycles then fall back to hrtimer
300 * based cpu-clock-tick sw counter, which
301 * is always available even if no PMU support:
303 if (attr
->type
== PERF_TYPE_HARDWARE
304 && attr
->config
== PERF_COUNT_HW_CPU_CYCLES
) {
307 ui__warning("The cycles event is not supported, "
308 "trying to fall back to cpu-clock-ticks\n");
309 attr
->type
= PERF_TYPE_SOFTWARE
;
310 attr
->config
= PERF_COUNT_SW_CPU_CLOCK
;
315 ui__warning("The %s event is not supported.\n",
321 error("sys_perf_event_open() syscall returned with %d (%s). /bin/dmesg may provide additional information.\n",
324 #if defined(__i386__) || defined(__x86_64__)
325 if (attr
->type
== PERF_TYPE_HARDWARE
&& err
== EOPNOTSUPP
)
326 die("No hardware sampling interrupt available."
327 " No APIC? If so then you can boot the kernel"
328 " with the \"lapic\" boot parameter to"
329 " force-enable it.\n");
332 die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
336 if (perf_evlist__set_filters(evlist
)) {
337 error("failed to set filter with %d (%s)\n", errno
,
342 if (perf_evlist__mmap(evlist
, mmap_pages
, false) < 0)
343 die("failed to mmap with %d (%s)\n", errno
, strerror(errno
));
346 session
->evlist
= evlist
;
348 if (!perf_evlist__equal(session
->evlist
, evlist
)) {
349 fprintf(stderr
, "incompatible append\n");
354 perf_session__update_sample_type(session
);
357 static int process_buildids(void)
359 u64 size
= lseek(output
, 0, SEEK_CUR
);
364 session
->fd
= output
;
365 return __perf_session__process_events(session
, post_processing_offset
,
366 size
- post_processing_offset
,
367 size
, &build_id__mark_dso_hit_ops
);
370 static void atexit_header(void)
373 session
->header
.data_size
+= bytes_written
;
377 perf_session__write_header(session
, evsel_list
, output
, true);
378 perf_session__delete(session
);
379 perf_evlist__delete(evsel_list
);
384 static void perf_event__synthesize_guest_os(struct machine
*machine
, void *data
)
387 struct perf_session
*psession
= data
;
389 if (machine__is_host(machine
))
393 *As for guest kernel when processing subcommand record&report,
394 *we arrange module mmap prior to guest kernel mmap and trigger
395 *a preload dso because default guest module symbols are loaded
396 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
397 *method is used to avoid symbol missing when the first addr is
398 *in module instead of in guest kernel.
400 err
= perf_event__synthesize_modules(process_synthesized_event
,
403 pr_err("Couldn't record guest kernel [%d]'s reference"
404 " relocation symbol.\n", machine
->pid
);
407 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
408 * sometimes has no _text.
410 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
411 psession
, machine
, "_text");
413 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
417 pr_err("Couldn't record guest kernel [%d]'s reference"
418 " relocation symbol.\n", machine
->pid
);
421 static struct perf_event_header finished_round_event
= {
422 .size
= sizeof(struct perf_event_header
),
423 .type
= PERF_RECORD_FINISHED_ROUND
,
426 static void mmap_read_all(void)
430 for (i
= 0; i
< evsel_list
->nr_mmaps
; i
++) {
431 if (evsel_list
->mmap
[i
].base
)
432 mmap_read(&evsel_list
->mmap
[i
]);
435 if (perf_header__has_feat(&session
->header
, HEADER_TRACE_INFO
))
436 write_output(&finished_round_event
, sizeof(finished_round_event
));
439 static int __cmd_record(int argc
, const char **argv
)
445 unsigned long waking
= 0;
446 int child_ready_pipe
[2], go_pipe
[2];
447 const bool forks
= argc
> 0;
449 struct machine
*machine
;
451 page_size
= sysconf(_SC_PAGE_SIZE
);
454 signal(SIGCHLD
, sig_handler
);
455 signal(SIGINT
, sig_handler
);
456 signal(SIGUSR1
, sig_handler
);
458 if (forks
&& (pipe(child_ready_pipe
) < 0 || pipe(go_pipe
) < 0)) {
459 perror("failed to create pipes");
464 if (!fstat(STDOUT_FILENO
, &st
) && S_ISFIFO(st
.st_mode
))
467 output_name
= "perf.data";
470 if (!strcmp(output_name
, "-"))
472 else if (!stat(output_name
, &st
) && st
.st_size
) {
473 if (write_mode
== WRITE_FORCE
) {
474 char oldname
[PATH_MAX
];
475 snprintf(oldname
, sizeof(oldname
), "%s.old",
478 rename(output_name
, oldname
);
480 } else if (write_mode
== WRITE_APPEND
) {
481 write_mode
= WRITE_FORCE
;
485 flags
= O_CREAT
|O_RDWR
;
486 if (write_mode
== WRITE_APPEND
)
492 output
= STDOUT_FILENO
;
494 output
= open(output_name
, flags
, S_IRUSR
| S_IWUSR
);
496 perror("failed to create output file");
500 session
= perf_session__new(output_name
, O_WRONLY
,
501 write_mode
== WRITE_FORCE
, false, NULL
);
502 if (session
== NULL
) {
503 pr_err("Not enough memory for reading perf file header\n");
508 perf_header__set_feat(&session
->header
, HEADER_BUILD_ID
);
511 err
= perf_session__read_header(session
, output
);
513 goto out_delete_session
;
516 if (have_tracepoints(&evsel_list
->entries
))
517 perf_header__set_feat(&session
->header
, HEADER_TRACE_INFO
);
519 /* 512 kiB: default amount of unprivileged mlocked memory */
520 if (mmap_pages
== UINT_MAX
)
521 mmap_pages
= (512 * 1024) / page_size
;
526 perror("failed to fork");
533 close(child_ready_pipe
[0]);
535 fcntl(go_pipe
[0], F_SETFD
, FD_CLOEXEC
);
538 * Do a dummy execvp to get the PLT entry resolved,
539 * so we avoid the resolver overhead on the real
542 execvp("", (char **)argv
);
545 * Tell the parent we're ready to go
547 close(child_ready_pipe
[1]);
550 * Wait until the parent tells us to go.
552 if (read(go_pipe
[0], &buf
, 1) == -1)
553 perror("unable to read pipe");
555 execvp(argv
[0], (char **)argv
);
558 kill(getppid(), SIGUSR1
);
562 if (!system_wide
&& target_tid
== -1 && target_pid
== -1)
563 evsel_list
->threads
->map
[0] = child_pid
;
565 close(child_ready_pipe
[1]);
568 * wait for child to settle
570 if (read(child_ready_pipe
[0], &buf
, 1) == -1) {
571 perror("unable to read pipe");
574 close(child_ready_pipe
[0]);
577 open_counters(evsel_list
);
580 * perf_session__delete(session) will be called at atexit_header()
582 atexit(atexit_header
);
585 err
= perf_header__write_pipe(output
);
588 } else if (file_new
) {
589 err
= perf_session__write_header(session
, evsel_list
,
595 post_processing_offset
= lseek(output
, 0, SEEK_CUR
);
598 err
= perf_session__synthesize_attrs(session
,
599 process_synthesized_event
);
601 pr_err("Couldn't synthesize attrs.\n");
605 err
= perf_event__synthesize_event_types(process_synthesized_event
,
608 pr_err("Couldn't synthesize event_types.\n");
612 if (have_tracepoints(&evsel_list
->entries
)) {
614 * FIXME err <= 0 here actually means that
615 * there were no tracepoints so its not really
616 * an error, just that we don't need to
617 * synthesize anything. We really have to
618 * return this more properly and also
619 * propagate errors that now are calling die()
621 err
= perf_event__synthesize_tracing_data(output
, evsel_list
,
622 process_synthesized_event
,
625 pr_err("Couldn't record tracing data.\n");
632 machine
= perf_session__find_host_machine(session
);
634 pr_err("Couldn't find native kernel information.\n");
638 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
639 session
, machine
, "_text");
641 err
= perf_event__synthesize_kernel_mmap(process_synthesized_event
,
642 session
, machine
, "_stext");
644 pr_err("Couldn't record kernel reference relocation symbol\n"
645 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
646 "Check /proc/kallsyms permission or run as root.\n");
648 err
= perf_event__synthesize_modules(process_synthesized_event
,
651 pr_err("Couldn't record kernel module information.\n"
652 "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
653 "Check /proc/modules permission or run as root.\n");
656 perf_session__process_machines(session
,
657 perf_event__synthesize_guest_os
);
660 perf_event__synthesize_thread_map(evsel_list
->threads
,
661 process_synthesized_event
,
664 perf_event__synthesize_threads(process_synthesized_event
,
668 struct sched_param param
;
670 param
.sched_priority
= realtime_prio
;
671 if (sched_setscheduler(0, SCHED_FIFO
, ¶m
)) {
672 pr_err("Could not set realtime priority.\n");
689 if (hits
== samples
) {
692 err
= poll(evsel_list
->pollfd
, evsel_list
->nr_fds
, -1);
697 for (i
= 0; i
< evsel_list
->cpus
->nr
; i
++) {
698 struct perf_evsel
*pos
;
700 list_for_each_entry(pos
, &evsel_list
->entries
, node
) {
702 thread
< evsel_list
->threads
->nr
;
704 ioctl(FD(pos
, i
, thread
),
705 PERF_EVENT_IOC_DISABLE
);
711 if (quiet
|| signr
== SIGUSR1
)
714 fprintf(stderr
, "[ perf record: Woken up %ld times to write data ]\n", waking
);
717 * Approximate RIP event size: 24 bytes.
720 "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64
" samples) ]\n",
721 (double)bytes_written
/ 1024.0 / 1024.0,
728 perf_session__delete(session
);
732 static const char * const record_usage
[] = {
733 "perf record [<options>] [<command>]",
734 "perf record [<options>] -- <command> [<options>]",
738 static bool force
, append_file
;
740 const struct option record_options
[] = {
741 OPT_CALLBACK('e', "event", &evsel_list
, "event",
742 "event selector. use 'perf list' to list available events",
743 parse_events_option
),
744 OPT_CALLBACK(0, "filter", &evsel_list
, "filter",
745 "event filter", parse_filter
),
746 OPT_INTEGER('p', "pid", &target_pid
,
747 "record events on existing process id"),
748 OPT_INTEGER('t', "tid", &target_tid
,
749 "record events on existing thread id"),
750 OPT_INTEGER('r', "realtime", &realtime_prio
,
751 "collect data with this RT SCHED_FIFO priority"),
752 OPT_BOOLEAN('D', "no-delay", &nodelay
,
753 "collect data without buffering"),
754 OPT_BOOLEAN('R', "raw-samples", &raw_samples
,
755 "collect raw sample records from all opened counters"),
756 OPT_BOOLEAN('a', "all-cpus", &system_wide
,
757 "system-wide collection from all CPUs"),
758 OPT_BOOLEAN('A', "append", &append_file
,
759 "append to the output file to do incremental profiling"),
760 OPT_STRING('C', "cpu", &cpu_list
, "cpu",
761 "list of cpus to monitor"),
762 OPT_BOOLEAN('f', "force", &force
,
763 "overwrite existing data file (deprecated)"),
764 OPT_U64('c', "count", &user_interval
, "event period to sample"),
765 OPT_STRING('o', "output", &output_name
, "file",
767 OPT_BOOLEAN('i', "no-inherit", &no_inherit
,
768 "child tasks do not inherit counters"),
769 OPT_UINTEGER('F', "freq", &user_freq
, "profile at this frequency"),
770 OPT_UINTEGER('m', "mmap-pages", &mmap_pages
, "number of mmap data pages"),
771 OPT_BOOLEAN('g', "call-graph", &call_graph
,
772 "do call-graph (stack chain/backtrace) recording"),
773 OPT_INCR('v', "verbose", &verbose
,
774 "be more verbose (show counter open errors, etc)"),
775 OPT_BOOLEAN('q', "quiet", &quiet
, "don't print any message"),
776 OPT_BOOLEAN('s', "stat", &inherit_stat
,
777 "per thread counts"),
778 OPT_BOOLEAN('d', "data", &sample_address
,
780 OPT_BOOLEAN('T', "timestamp", &sample_time
, "Sample timestamps"),
781 OPT_BOOLEAN('n', "no-samples", &no_samples
,
783 OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache
,
784 "do not update the buildid cache"),
785 OPT_BOOLEAN('B', "no-buildid", &no_buildid
,
786 "do not collect buildids in perf.data"),
787 OPT_CALLBACK('G', "cgroup", &evsel_list
, "name",
788 "monitor event in cgroup name only",
793 int cmd_record(int argc
, const char **argv
, const char *prefix __used
)
796 struct perf_evsel
*pos
;
798 evsel_list
= perf_evlist__new(NULL
, NULL
);
799 if (evsel_list
== NULL
)
802 argc
= parse_options(argc
, argv
, record_options
, record_usage
,
803 PARSE_OPT_STOP_AT_NON_OPTION
);
804 if (!argc
&& target_pid
== -1 && target_tid
== -1 &&
805 !system_wide
&& !cpu_list
)
806 usage_with_options(record_usage
, record_options
);
808 if (force
&& append_file
) {
809 fprintf(stderr
, "Can't overwrite and append at the same time."
810 " You need to choose between -f and -A");
811 usage_with_options(record_usage
, record_options
);
812 } else if (append_file
) {
813 write_mode
= WRITE_APPEND
;
815 write_mode
= WRITE_FORCE
;
818 if (nr_cgroups
&& !system_wide
) {
819 fprintf(stderr
, "cgroup monitoring only available in"
820 " system-wide mode\n");
821 usage_with_options(record_usage
, record_options
);
826 if (symbol_conf
.kptr_restrict
)
828 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
829 "check /proc/sys/kernel/kptr_restrict.\n\n"
830 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
831 "file is not found in the buildid cache or in the vmlinux path.\n\n"
832 "Samples in kernel modules won't be resolved at all.\n\n"
833 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
834 "even with a suitable vmlinux or kallsyms file.\n\n");
836 if (no_buildid_cache
|| no_buildid
)
837 disable_buildid_cache();
839 if (evsel_list
->nr_entries
== 0 &&
840 perf_evlist__add_default(evsel_list
) < 0) {
841 pr_err("Not enough memory for event selector list\n");
842 goto out_symbol_exit
;
845 if (target_pid
!= -1)
846 target_tid
= target_pid
;
848 if (perf_evlist__create_maps(evsel_list
, target_pid
,
849 target_tid
, cpu_list
) < 0)
850 usage_with_options(record_usage
, record_options
);
852 list_for_each_entry(pos
, &evsel_list
->entries
, node
) {
853 if (perf_evsel__alloc_fd(pos
, evsel_list
->cpus
->nr
,
854 evsel_list
->threads
->nr
) < 0)
856 if (perf_header__push_event(pos
->attr
.config
, event_name(pos
)))
860 if (perf_evlist__alloc_pollfd(evsel_list
) < 0)
863 if (user_interval
!= ULLONG_MAX
)
864 default_interval
= user_interval
;
865 if (user_freq
!= UINT_MAX
)
869 * User specified count overrides default frequency.
871 if (default_interval
)
874 default_interval
= freq
;
876 fprintf(stderr
, "frequency and count are zero, aborting\n");
881 err
= __cmd_record(argc
, argv
);
883 perf_evlist__delete_maps(evsel_list
);