hphp/util/perf-event.cpp (hiphop-php.git)
/*
   +----------------------------------------------------------------------+
   | HipHop for PHP                                                       |
   +----------------------------------------------------------------------+
   | Copyright (c) 2010-present Facebook, Inc. (http://www.facebook.com)  |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,     |
   | that is bundled with this package in the file LICENSE, and is       |
   | available through the world-wide-web at the following url:          |
   | http://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to  |
   | obtain it through the world-wide-web, please send a note to         |
   | license@php.net so we can mail you a copy immediately.              |
   +----------------------------------------------------------------------+
*/

#include "hphp/util/perf-event.h"

#if defined(__linux__) && defined(__x86_64__) && defined(FACEBOOK)

#include "hphp/util/assertions.h"
#include "hphp/util/logger.h"
#include "hphp/util/safe-cast.h"

#include <folly/FileUtil.h>
#include <folly/Optional.h>
#include <folly/String.h>

#include <mutex>
#include <string>

#include <asm/unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

// These two files must be included in this relative order, because the latter
// transitively includes a local copy of the former unless it detects that the
// system version has already been included.
#include <linux/perf_event.h>
#include <perfmon/pfmlib_perf_event.h>

namespace HPHP {

namespace {

///////////////////////////////////////////////////////////////////////////////

/*
 * Process initialization bit and lock.
 */
bool s_did_init = false;
std::mutex s_init_lock;

/*
 * Page size.
 */
size_t s_pagesz = 0;

/*
 * Microarch-dependent event names for perf's cpu/mem-{loads,stores}/ events,
 * in a form understood by libpfm4.
 *
 * We could just encode the `config' for perf_event_attr ourselves, but libpfm4
 * does other things for us, like set the exclusion bits, and the encoding is
 * not well-specified in the first place.  The trade-off is that we had to
 * match some bits to names ahead of time.
 *
 * These may be altered when the module is initialized.
 */
// On Haswell and later, this event is called "LOAD_LATENCY".
const char* s_mem_loads = "MEM_TRANS_RETIRED:LATENCY_ABOVE_THRESHOLD";
// On Haswell and later, "MEM_UOPS_RETIRED:ALL_STORES" is used instead.
const char* s_mem_stores = "MEM_TRANS_RETIRED:PRECISE_STORE";

///////////////////////////////////////////////////////////////////////////////

/*
 * Metadata for a fully set up perf_event.
 */
struct perf_event_handle {
  perf_event_handle() {}
  perf_event_handle(int fd, struct perf_event_mmap_page* meta)
    : fd(fd)
    , meta(meta)
  {}

  // File descriptor of the opened perf_event.
  int fd{-1};

  // Metadata header page, followed by the ring buffer for samples.
  struct perf_event_mmap_page* meta{nullptr};

  // Buffer for samples that wrap around.
  char* buf{nullptr};
  size_t buf_sz{0};
};

/*
 * Per-thread perf_event metadata.
 */
thread_local struct {
  perf_event_handle loads;
  perf_event_handle stores;
  perf_event_signal_fn_t signal;
} tl_perf_event = {};

/*
 * Ensure that this module is properly initialized.
 *
 * Returns true if the module has been initialized successfully (by anyone),
 * else false.
 */
bool perf_event_init() {
  if (s_did_init) return true;

  std::lock_guard<std::mutex> l(s_init_lock);
  if (s_did_init) return true;

  s_pagesz = sysconf(_SC_PAGESIZE);

  std::string event_str;
  if (folly::readFile("/sys/devices/cpu/events/mem-stores", event_str)) {
    // If the read fails, we'll stick with the {Sandy,Ivy}Bridge event name.
    // Otherwise, check for the Haswell encoding string.
    //
    // @see: linux/arch/x86/events/intel/core.c.
    if (event_str == "event=0xd0,umask=0x82") {
      s_mem_stores = "MEM_UOPS_RETIRED:ALL_STORES";
    }
    // `event_str' should be "event=0xcd,umask=0x2" on *Bridge, but we don't
    // care, since we're using that event name as our default.
  }

  // libpfm4 needs to be initialized exactly once per process lifetime.
  auto const pfmr = pfm_initialize();
  if (pfmr != PFM_SUCCESS) {
    Logger::Warning("perf_event: pfm_initialize failed: %s",
                    pfm_strerror(pfmr));
    return false;
  }

  s_did_init = true;
  return true;
}

/*
 * Size of the mmap'd perf_event output ring buffer.
 *
 * Must be exactly 2^n pages for some `n' (or 1 + 2^n, if we include the
 * perf_event header page).
 */
size_t buffer_sz() { return s_pagesz * (1 << 5); }   // ring buffer only
size_t mmap_sz()   { return s_pagesz + buffer_sz(); } // with header page
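
// For example, with 4 KiB pages, buffer_sz() is 32 pages (128 KiB) and
// mmap_sz() is 33 pages (132 KiB).  The kernel requires the sample area to be
// a power-of-two number of pages, and consume_events() relies on that when it
// reduces data_head and data_tail modulo buffer_sz().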

///////////////////////////////////////////////////////////////////////////////

/*
 * Register that a perf event was generated.
 */
void signal_event(int sig, siginfo_t* info, void* /*context*/) {
  if (sig != SIGIO || info == nullptr) return;

  // Older versions of Linux have SIGIO here; newer versions have POLLIN.
  if (info->si_code != SIGIO && info->si_code != POLLIN) return;
  // We only care about read signals.
  if ((info->si_band & POLLERR) || (info->si_band & POLLNVAL)) return;
  if (!(info->si_band & POLLIN)) return;

  if (tl_perf_event.signal == nullptr) return;

  auto const type = [&]() -> folly::Optional<PerfEvent> {
    if (info->si_fd == tl_perf_event.loads.fd)  return PerfEvent::Load;
    if (info->si_fd == tl_perf_event.stores.fd) return PerfEvent::Store;
    return folly::none;
  }();
  if (!type) return;

  tl_perf_event.signal(*type);
}
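
// Since `signal_event' runs in signal context, the user-supplied callback
// must be async-signal-safe.  A safe pattern is for the callback to do no
// more than record which kind of sample is pending (e.g., by setting a flag)
// and to drain samples later via perf_event_consume(), from normal context.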

/*
 * Install `signal_event' to notify the user of new perf_event samples.
 *
 * Returns true if the handler was successfully installed, else false.  If a
 * handler for SIGIO was already installed, this will fail.  Otherwise, if we
 * install `signal_event' successfully, SIGIO will be unconditionally unblocked
 * for the calling thread.
 */
bool install_sigio_handler() {
  struct sigaction old_action;

  if (sigaction(SIGIO, nullptr, &old_action) < 0) {
    Logger::Warning("perf_event: could not install SIGIO handler: %s",
                    folly::errnoStr(errno).c_str());
    return false;
  }

  // Fail if a competing SIGIO handler is found.
  if (old_action.sa_handler != SIG_DFL &&
      old_action.sa_handler != SIG_IGN &&
      old_action.sa_sigaction != signal_event) {
    Logger::Warning("perf_event: could not install SIGIO handler: "
                    "found existing handler");
    return false;
  }

  // Install our signal handler for SIGIO.
  struct sigaction action = {};
  action.sa_sigaction = signal_event;
  action.sa_flags = SA_SIGINFO;

  if (sigaction(SIGIO, &action, nullptr) < 0) {
    Logger::Warning("perf_event: could not install SIGIO handler: %s",
                    folly::errnoStr(errno).c_str());
    return false;
  }

  // Ensure that SIGIO is unblocked.  Note that pthread_sigmask() reports
  // failure via its return value rather than by setting errno.
  sigset_t sigs;
  sigemptyset(&sigs);
  sigaddset(&sigs, SIGIO);
  if (auto const err = pthread_sigmask(SIG_UNBLOCK, &sigs, nullptr)) {
    Logger::Warning("perf_event: could not unblock SIGIO: %s",
                    folly::errnoStr(err).c_str());
    return false;
  }

  return true;
}

///////////////////////////////////////////////////////////////////////////////

/*
 * Pause or resume an event.
 */
void pause_event(const perf_event_handle& pe) {
  ioctl(pe.fd, PERF_EVENT_IOC_DISABLE, 0);
}
void resume_event(const perf_event_handle& pe) {
  ioctl(pe.fd, PERF_EVENT_IOC_ENABLE, 0);
}

/*
 * Logically delete all events that are currently buffered for `pe'.
 */
void clear_events(const perf_event_handle& pe) {
  auto const data_head = pe.meta->data_head;
  __sync_synchronize(); // smp_mb()
  pe.meta->data_tail = data_head;
}
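
// These routines rely on the usual perf_event ring buffer protocol: the
// kernel advances `data_head' as it writes samples, and userspace publishes
// how far it has read by storing to `data_tail'.  Setting data_tail to
// data_head therefore discards everything currently buffered; the barrier
// keeps the read of data_head ordered before the store to data_tail.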

/*
 * Disable and close a perf event.
 */
void close_event(const perf_event_handle& pe) {
  clear_events(pe);
  free(pe.buf);
  ioctl(pe.fd, PERF_EVENT_IOC_DISABLE, 0);
  munmap(pe.meta, mmap_sz());
  close(pe.fd);
}

/*
 * Open a file descriptor for perf events with `event_name', mmap it, and set
 * things up so that the calling thread receives SIGIO signals from it.
 *
 * Returns the perf_event_handle on success, else folly::none.
 */
folly::Optional<perf_event_handle> enable_event(const char* event_name,
                                                uint64_t sample_freq) {
  struct perf_event_attr attr = {};
  pfm_perf_encode_arg_t arg = {};
  arg.attr = &attr;
  arg.size = sizeof(arg);

  // Populate the `type', `config', and `exclude_*' members on `attr'.
  auto const pfmr = pfm_get_os_event_encoding(event_name, PFM_PLM3,
                                              PFM_OS_PERF_EVENT, &arg);
  if (pfmr != PFM_SUCCESS) {
    Logger::Warning("perf_event: failed to get encoding for %s: %s",
                    event_name, pfm_strerror(pfmr));
    return folly::none;
  }

  // Finish setting up `attr' and open the event.
  attr.size = sizeof(attr);
  attr.disabled = 1;
  attr.sample_freq = sample_freq;
  attr.freq = 1;  // interpret sample_freq as a frequency, not a period
  attr.watermark = 0;
  attr.wakeup_events = 1;
  attr.precise_ip = 2;  // request zero skid

  attr.sample_type = PERF_SAMPLE_IP
                   | PERF_SAMPLE_TID
                   | PERF_SAMPLE_ADDR
                   | PERF_SAMPLE_CALLCHAIN
                   | PERF_SAMPLE_DATA_SRC
                   ;
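
  // perf_event_open(2) arguments: pid == 0 and cpu == -1 measure the calling
  // thread on any CPU; group_fd == -1 puts the event in its own group; no
  // flags are passed.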
  auto const ret = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
  if (ret < 0) {
    // Some machines lack PEBS support, which precise_ip > 0 requires, so the
    // open can fail here.  Without PEBS, PERF_SAMPLE_ADDR would only ever
    // return zeros instead of the sampled memory address, so there's nothing
    // useful to fall back to; just fail silently in this case.
    Logger::Verbose("perf_event: perf_event_open failed with: %s",
                    folly::errnoStr(errno).c_str());
    return folly::none;
  }
  auto const fd = safe_cast<int>(ret);

  // Recent versions of Linux have a CLOEXEC flag for perf_event_open(), but we
  // use fcntl() for portability.  Note that since we do this after we open the
  // event, this could in theory race with an exec() from another thread---but
  // that shouldn't be happening anyway.  (F_SETFD takes the fd-flag
  // FD_CLOEXEC, not the open(2) flag O_CLOEXEC.)
  fcntl(fd, F_SETFD, FD_CLOEXEC);

  // Make sure that any SIGIO sent from `fd' is handled by the calling thread.
  struct f_owner_ex owner;
  owner.type = F_OWNER_TID;
  owner.pid = syscall(__NR_gettid);

  // Set up `fd' to send SIGIO with sigaction info.
  if (fcntl(fd, F_SETFL, O_ASYNC) < 0 ||
      fcntl(fd, F_SETSIG, SIGIO) < 0 ||
      fcntl(fd, F_SETOWN_EX, &owner) < 0) {
    Logger::Warning("perf_event: failed to set up asynchronous I/O: %s",
                    folly::errnoStr(errno).c_str());
    close(fd);
    return folly::none;
  }

  // Map the ring buffer for our samples.
  auto const base = mmap(nullptr, mmap_sz(), PROT_READ | PROT_WRITE,
                         MAP_SHARED, fd, 0);
  if (base == MAP_FAILED) {
    Logger::Warning("perf_event: failed to mmap perf_event: %s",
                    folly::errnoStr(errno).c_str());
    close(fd);
    return folly::none;
  }
  auto const meta = reinterpret_cast<struct perf_event_mmap_page*>(base);

  auto const pe = perf_event_handle { fd, meta };

  // Reset the event.  This seems to be present in most examples, but it's
  // unclear if it's necessary or just good hygiene.  (It's possible that it's
  // necessary on successive opens.)
  if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) < 0) {
    Logger::Warning("perf_event: failed to reset perf_event: %s",
                    folly::errnoStr(errno).c_str());
    close_event(pe);
    return folly::none;
  }

  // Enable the event.  The man page and other examples of usage all suggest
  // that the right thing to do is to start with the event disabled and then
  // enable it manually afterwards, so we do the same here even though it seems
  // strange and circuitous.
  if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
    Logger::Warning("perf_event: failed to enable perf_event: %s",
                    folly::errnoStr(errno).c_str());
    close_event(pe);
    return folly::none;
  }

  return pe;
}
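
// Note the two cleanup regimes in enable_event(): before the mmap() succeeds,
// failure paths just close(fd); afterwards, they go through close_event(),
// which also unmaps the ring buffer.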

///////////////////////////////////////////////////////////////////////////////

/*
 * Ensure that `pe.buf' can hold at least `cap' bytes.
 */
void ensure_buffer_capacity(perf_event_handle& pe, size_t cap) {
  if (pe.buf_sz >= cap) return;
  free(pe.buf);
  pe.buf = reinterpret_cast<char*>(malloc(cap * 2));
  // Track the new capacity; without this, the early return above never fires
  // and we reallocate on every wrapped record.
  pe.buf_sz = cap * 2;
}

/*
 * Iterate through all the pending sampled events in `pe' and pass each one to
 * `consume'.
 */
void consume_events(PerfEvent kind, perf_event_handle& pe,
                    perf_event_consume_fn_t consume) {
  auto const data_tail = pe.meta->data_tail;
  auto const data_head = pe.meta->data_head;

  asm volatile("" : : : "memory"); // smp_rmb()

  if (data_head == data_tail) return;

  auto const base = reinterpret_cast<char*>(pe.meta) + s_pagesz;

  auto const begin = base + data_tail % buffer_sz();
  auto const end   = base + data_head % buffer_sz();

  auto cur = begin;

  while (cur != end) {
    auto header = reinterpret_cast<struct perf_event_header*>(cur);

    if (cur + header->size > base + buffer_sz()) {
      // The current entry wraps around the ring buffer.  Copy it into our
      // side buffer, and update `cur' to wrap around appropriately.
      auto const prefix_len = base + buffer_sz() - cur;

      ensure_buffer_capacity(pe, header->size);

      memcpy(pe.buf, cur, prefix_len);
      memcpy(pe.buf + prefix_len, base, header->size - prefix_len);
      header = reinterpret_cast<struct perf_event_header*>(pe.buf);

      cur = base + header->size - prefix_len;
    } else if (cur + header->size == base + buffer_sz()) {
      // Perfect wraparound.
      cur = base;
    } else {
      cur += header->size;
    }

    if (header->type == PERF_RECORD_SAMPLE) {
      auto const sample = reinterpret_cast<perf_event_sample*>(header + 1);

      assertx(header->size == sizeof(struct perf_event_header) +
                              sizeof(perf_event_sample) +
                              sample->nr * sizeof(*sample->ips) +
                              sizeof(perf_event_sample_tail));
      assertx((char*)(sample->tail() + 1) == (char*)header + header->size);
      consume(kind, sample);
    }
  }

  __sync_synchronize(); // smp_mb()
  pe.meta->data_tail = data_head;
}
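
// Note that the loop above advances `cur' past every record, but only
// PERF_RECORD_SAMPLE records reach `consume'; any other record types the
// kernel emits (e.g., PERF_RECORD_THROTTLE) are skipped.  Publishing
// `data_tail' once, after the loop, returns the whole scanned range to the
// kernel in one step.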

///////////////////////////////////////////////////////////////////////////////

}

///////////////////////////////////////////////////////////////////////////////

perf_event_data_src_info
perf_event_data_src(PerfEvent kind, uint64_t data_src) {
  auto info = perf_event_data_src_info{};

  // PERF_MEM_OP_SHIFT is zero, so the `op' bits can be tested unshifted.
  DEBUG_ONLY auto const mem_op = data_src;
  switch (kind) {
    case PerfEvent::Load:
      assertx(mem_op & PERF_MEM_OP_LOAD);
      break;
    case PerfEvent::Store:
      assertx(mem_op & PERF_MEM_OP_STORE);
      break;
  }

  auto const mem_lvl = data_src >> PERF_MEM_LVL_SHIFT;

  if (mem_lvl & PERF_MEM_LVL_NA) {
    info.mem_lvl = "(unknown)";
    info.mem_hit = 0;
  } else {
    info.mem_hit = (mem_lvl & PERF_MEM_LVL_HIT)  ? 1 :
                   (mem_lvl & PERF_MEM_LVL_MISS) ? -1 : 0;

#define MEM_LVLS \
  X(L1)          \
  X(LFB)         \
  X(L2)          \
  X(L3)          \
  X(LOC_RAM)     \
  X(REM_RAM1)    \
  X(REM_RAM2)    \
  X(REM_CCE1)    \
  X(REM_CCE2)    \
  X(IO)          \
  X(UNC)

    auto const mem_lvl_only = mem_lvl & (0x0
#define X(lvl) | PERF_MEM_LVL_##lvl
      MEM_LVLS
#undef X
    );

    info.mem_lvl = [&]() -> const char* {
      switch (mem_lvl_only) {
        case 0x0: return "(none)";
#define X(lvl) \
        case PERF_MEM_LVL_##lvl: return #lvl;
        MEM_LVLS
#undef X
        default: return "(mixed)";
      }
    }();
  }

#undef MEM_LVLS
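
  // For instance, a load that hit in L1 encodes mem_lvl as (HIT | L1), so
  // mem_hit is 1, mem_lvl_only reduces to PERF_MEM_LVL_L1, and info.mem_lvl
  // is "L1".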

  auto const mem_snoop = data_src >> PERF_MEM_SNOOP_SHIFT;

  if (mem_snoop & PERF_MEM_SNOOP_NA) {
    info.snoop      = 0;
    info.snoop_hit  = 0;
    info.snoop_hitm = 0;
  } else {
    info.snoop_hit = (mem_snoop & PERF_MEM_SNOOP_HIT)  ? 1 :
                     (mem_snoop & PERF_MEM_SNOOP_MISS) ? -1 : 0;
    info.snoop      = (mem_snoop & PERF_MEM_SNOOP_NONE) ? -1 : 1;
    info.snoop_hitm = (mem_snoop & PERF_MEM_SNOOP_HITM) ? 1 : -1;
  }

  auto const mem_lock = data_src >> PERF_MEM_LOCK_SHIFT;
  info.locked = (mem_lock & PERF_MEM_LOCK_NA) ? 0 :
                (mem_lock & PERF_MEM_LOCK_LOCKED) ? 1 : -1;

  auto const mem_tlb = data_src >> PERF_MEM_TLB_SHIFT;

  if (mem_tlb & PERF_MEM_TLB_NA) {
    info.tlb = "(unknown)";
    info.tlb_hit = 0;
  } else {
    info.tlb_hit = (mem_tlb & PERF_MEM_TLB_HIT)  ? 1 :
                   (mem_tlb & PERF_MEM_TLB_MISS) ? -1 : 0;

#define TLBS \
  X(L1)      \
  X(L2)      \
  X(WK)      \
  X(OS)

    auto const tlb_only = mem_tlb & (0x0
#define X(tlb) | PERF_MEM_TLB_##tlb
      TLBS
#undef X
    );

    info.tlb = [&]() -> const char* {
      switch (tlb_only) {
        case 0x0: return "(none)";
#define X(tlb) \
        case PERF_MEM_TLB_##tlb: return #tlb;
        TLBS
#undef X
        case (PERF_MEM_TLB_L1 | PERF_MEM_TLB_L2): return "L1-L2";
        default: return "(mixed)";
      }
    }();
  }

#undef TLBS

  return info;
}

///////////////////////////////////////////////////////////////////////////////

bool perf_event_enable(uint64_t sample_freq, perf_event_signal_fn_t signal_fn) {
  if (!perf_event_init()) return false;

  // If `tl_perf_event' has already been initialized, we're done.
  if (tl_perf_event.signal != nullptr) return true;

  if (!install_sigio_handler()) return false;

  auto const ld_pe = enable_event(s_mem_loads, sample_freq);
  if (!ld_pe) return false;

  auto const st_pe = enable_event(s_mem_stores, sample_freq);
  if (!st_pe) {
    close_event(*ld_pe);
    return false;
  }

  // Set `tl_perf_event'---and in particular, `signal'---only after everything
  // is enabled.  This will cause us to ignore signals until we're ready to
  // process the events.
  tl_perf_event.loads  = *ld_pe;
  tl_perf_event.stores = *st_pe;
  asm volatile("" : : : "memory"); // compiler barrier
  tl_perf_event.signal = signal_fn;

  return true;
}
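
// A minimal sketch of a calling thread (hypothetical: `on_sample',
// `my_consume_fn', and the work loop are not part of this module), assuming
// the callback types declared in hphp/util/perf-event.h:
//
//   static thread_local std::atomic<bool> s_pending{false};
//
//   // Runs in signal context, so it only records that samples are waiting.
//   void on_sample(PerfEvent) { s_pending = true; }
//
//   void profiled_loop() {
//     if (!perf_event_enable(1000, on_sample)) return;
//     while (work_remains()) {
//       do_work();
//       if (s_pending.exchange(false)) {
//         // Drain pending samples from normal (non-signal) context.
//         perf_event_consume(my_consume_fn);
//       }
//     }
//     perf_event_disable();
//   }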

void perf_event_pause() {
  if (tl_perf_event.signal == nullptr) return;
  pause_event(tl_perf_event.loads);
  pause_event(tl_perf_event.stores);
}

void perf_event_resume() {
  if (tl_perf_event.signal == nullptr) return;
  resume_event(tl_perf_event.loads);
  resume_event(tl_perf_event.stores);
}

void perf_event_disable() {
  if (tl_perf_event.signal == nullptr) return;

  close_event(tl_perf_event.loads);
  close_event(tl_perf_event.stores);
  tl_perf_event = {};
}

void perf_event_consume(perf_event_consume_fn_t consume) {
  if (tl_perf_event.signal == nullptr) return;

  consume_events(PerfEvent::Load,  tl_perf_event.loads,  consume);
  consume_events(PerfEvent::Store, tl_perf_event.stores, consume);
}

///////////////////////////////////////////////////////////////////////////////

}

#else // defined(__linux__) && defined(__x86_64__) && defined(FACEBOOK)

namespace HPHP {

perf_event_data_src_info
perf_event_data_src(PerfEvent /*kind*/, uint64_t /*data_src*/) {
  return perf_event_data_src_info{};
}

bool perf_event_enable(uint64_t, perf_event_signal_fn_t) { return false; }
void perf_event_disable() {}
void perf_event_pause() {}
void perf_event_resume() {}
void perf_event_consume(perf_event_consume_fn_t) {}

}

#endif